Diffstat (limited to 'llvm/lib/Target')
 llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td     | 62
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp        | 63
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h          |  6
 llvm/lib/Target/AMDGPU/VOP1Instructions.td         | 11
 llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp   | 85
 llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td        | 37
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp |  4
 llvm/lib/Target/X86/X86ISelLowering.cpp            | 23
8 files changed, 169 insertions, 122 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 1e30735..36c9cb6 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -707,16 +707,14 @@ let Predicates = [HasSVE_or_SME] in {
   defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", AArch64sdot>;
   defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", AArch64udot>;
 
-  let Predicates = [HasSVE_or_SME] in {
-  def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)),
-            (UDOT_ZZZ_BtoS $Acc, $MulLHS, $MulRHS)>;
-  def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)),
-            (SDOT_ZZZ_BtoS $Acc, $MulLHS, $MulRHS)>;
-  def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
-            (UDOT_ZZZ_HtoD $Acc, $MulLHS, $MulRHS)>;
-  def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
-            (SDOT_ZZZ_HtoD $Acc, $MulLHS, $MulRHS)>;
-  } // End HasSVE_or_SME
+  def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)),
+            (UDOT_ZZZ_BtoS $Acc, $MulLHS, $MulRHS)>;
+  def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)),
+            (SDOT_ZZZ_BtoS $Acc, $MulLHS, $MulRHS)>;
+  def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
+            (UDOT_ZZZ_HtoD $Acc, $MulLHS, $MulRHS)>;
+  def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
+            (SDOT_ZZZ_HtoD $Acc, $MulLHS, $MulRHS)>;
 
   defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>;
   defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>;
@@ -3646,6 +3644,9 @@ let Predicates = [HasSVE_or_SME, HasMatMulInt8] in {
   defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", AArch64usdot>;
   defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>;
   defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>;
+
+  def : Pat<(nxv4i32 (partial_reduce_sumla nxv4i32:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)),
+            (USDOT_ZZZ $Acc, $RHS, $LHS)>;
 } // End HasSVE_or_SME, HasMatMulInt8
 
 let Predicates = [HasSVE, HasMatMulFP32] in {
@@ -3752,6 +3753,19 @@ let Predicates = [HasSVE2_or_SME] in {
   defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb", int_aarch64_sve_umlslb>;
   defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt", int_aarch64_sve_umlslt>;
 
+  def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv4i32:$LHS, nxv4i32:$RHS)),
+            (UMLALT_ZZZ_D (UMLALB_ZZZ_D $Acc, $LHS, $RHS), $LHS, $RHS)>;
+  def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv4i32:$LHS, nxv4i32:$RHS)),
+            (SMLALT_ZZZ_D (SMLALB_ZZZ_D $Acc, $LHS, $RHS), $LHS, $RHS)>;
+  def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$LHS, nxv8i16:$RHS)),
+            (UMLALT_ZZZ_S (UMLALB_ZZZ_S $Acc, $LHS, $RHS), $LHS, $RHS)>;
+  def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$LHS, nxv8i16:$RHS)),
+            (SMLALT_ZZZ_S (SMLALB_ZZZ_S $Acc, $LHS, $RHS), $LHS, $RHS)>;
+  def : Pat<(nxv8i16 (partial_reduce_umla nxv8i16:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)),
+            (UMLALT_ZZZ_H (UMLALB_ZZZ_H $Acc, $LHS, $RHS), $LHS, $RHS)>;
+  def : Pat<(nxv8i16 (partial_reduce_smla nxv8i16:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)),
+            (SMLALT_ZZZ_H (SMLALB_ZZZ_H $Acc, $LHS, $RHS), $LHS, $RHS)>;
+
   // SVE2 saturating multiply-add long (indexed)
   defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb", int_aarch64_sve_sqdmlalb_lane>;
   defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt", int_aarch64_sve_sqdmlalt_lane>;
@@ -3880,19 +3894,6 @@ let Predicates = [HasSVE2_or_SME] in {
   def : Pat<(nxv8i16 (partial_reduce_smla nxv8i16:$Acc, nxv16i8:$Input, (nxv16i8 (splat_vector (i32 1))))),
             (SADDWT_ZZZ_H (SADDWB_ZZZ_H $Acc, $Input), $Input)>;
 
-  def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv4i32:$LHS, nxv4i32:$RHS)),
-            (UMLALT_ZZZ_D (UMLALB_ZZZ_D $Acc, $LHS, $RHS), $LHS, $RHS)>;
-  def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv4i32:$LHS, nxv4i32:$RHS)),
-            (SMLALT_ZZZ_D (SMLALB_ZZZ_D $Acc, $LHS, $RHS), $LHS, $RHS)>;
-  def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$LHS, nxv8i16:$RHS)),
-            (UMLALT_ZZZ_S (UMLALB_ZZZ_S $Acc, $LHS, $RHS), $LHS, $RHS)>;
-  def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$LHS, nxv8i16:$RHS)),
-            (SMLALT_ZZZ_S (SMLALB_ZZZ_S $Acc, $LHS, $RHS), $LHS, $RHS)>;
-  def : Pat<(nxv8i16 (partial_reduce_umla nxv8i16:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)),
-            (UMLALT_ZZZ_H (UMLALB_ZZZ_H $Acc, $LHS, $RHS), $LHS, $RHS)>;
-  def : Pat<(nxv8i16 (partial_reduce_smla nxv8i16:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)),
-            (SMLALT_ZZZ_H (SMLALB_ZZZ_H $Acc, $LHS, $RHS), $LHS, $RHS)>;
-
   // SVE2 integer multiply long
   defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>;
   defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt", int_aarch64_sve_sqdmullt>;
@@ -4200,11 +4201,6 @@ let Predicates = [HasSVEAES2, HasNonStreamingSVE_or_SSVE_AES] in {
   def PMULL_2ZZZ_Q : sve_crypto_pmull_multi<"pmull">;
 }
 
-let Predicates = [HasSVE_or_SME, HasMatMulInt8] in {
-  def : Pat<(nxv4i32 (partial_reduce_sumla nxv4i32:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)),
-            (USDOT_ZZZ $Acc, $RHS, $LHS)>;
-  } // End HasSVE_or_SME, HasMatMulInt8
-
 //===----------------------------------------------------------------------===//
 // SME or SVE2.1 instructions
 //===----------------------------------------------------------------------===//
@@ -4238,12 +4234,10 @@ defm UDOT_ZZZ_HtoS : sve2p1_two_way_dot_vv<"udot", 0b1, int_aarch64_sve_udot_x2
 defm SDOT_ZZZI_HtoS : sve2p1_two_way_dot_vvi<"sdot", 0b0, int_aarch64_sve_sdot_lane_x2>;
 defm UDOT_ZZZI_HtoS : sve2p1_two_way_dot_vvi<"udot", 0b1, int_aarch64_sve_udot_lane_x2>;
 
-let Predicates = [HasSVE2p1_or_SME2] in {
-  def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
-            (UDOT_ZZZ_HtoS $Acc, $MulLHS, $MulRHS)>;
-  def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
-            (SDOT_ZZZ_HtoS $Acc, $MulLHS, $MulRHS)>;
-} // End HasSVE2p1_or_SME2
+def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
+          (UDOT_ZZZ_HtoS $Acc, $MulLHS, $MulRHS)>;
+def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
+          (SDOT_ZZZ_HtoS $Acc, $MulLHS, $MulRHS)>;
 
 defm SQCVTN_Z2Z_StoH : sve2p1_multi_vec_extract_narrow<"sqcvtn", 0b00, int_aarch64_sve_sqcvtn_x2>;
 defm UQCVTN_Z2Z_StoH : sve2p1_multi_vec_extract_narrow<"uqcvtn", 0b01, int_aarch64_sve_uqcvtn_x2>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index fab78a9..bdc0810 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -29,6 +29,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/MC/LaneBitmask.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -1633,64 +1634,6 @@ void GCNSchedStage::revertScheduling() {
   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
 }
 
-bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
-                                         SlotIndex OriginalIdx,
-                                         SlotIndex RematIdx) const {
-
-  LiveIntervals *LIS = DAG.LIS;
-  MachineRegisterInfo &MRI = DAG.MRI;
-  OriginalIdx = OriginalIdx.getRegSlot(true);
-  RematIdx = std::max(RematIdx, RematIdx.getRegSlot(true));
-  for (const MachineOperand &MO : InstToRemat->operands()) {
-    if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
-      continue;
-
-    if (!MO.getReg().isVirtual()) {
-      // Do not attempt to reason about PhysRegs
-      // TODO: better analysis of PhysReg livness
-      if (!DAG.MRI.isConstantPhysReg(MO.getReg()) &&
-          !DAG.TII->isIgnorableUse(MO))
-        return false;
-
-      // Constant PhysRegs and IgnorableUses are okay
-      continue;
-    }
-
-    LiveInterval &LI = LIS->getInterval(MO.getReg());
-    const VNInfo *OVNI = LI.getVNInfoAt(OriginalIdx);
-    assert(OVNI);
-
-    // Don't allow rematerialization immediately after the original def.
-    // It would be incorrect if InstToRemat redefines the register.
-    // See PR14098.
-    if (SlotIndex::isSameInstr(OriginalIdx, RematIdx))
-      return false;
-
-    if (OVNI != LI.getVNInfoAt(RematIdx))
-      return false;
-
-    // Check that subrange is live at RematIdx.
-    if (LI.hasSubRanges()) {
-      const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
-      unsigned SubReg = MO.getSubReg();
-      LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
-                              : MRI.getMaxLaneMaskForVReg(MO.getReg());
-      for (LiveInterval::SubRange &SR : LI.subranges()) {
-        if ((SR.LaneMask & LM).none())
-          continue;
-        if (!SR.liveAt(RematIdx))
-          return false;
-
-        // Early exit if all used lanes are checked. No need to continue.
-        LM &= ~SR.LaneMask;
-        if (LM.none())
-          break;
-      }
-    }
-  }
-  return true;
-}
-
 bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
   const Function &F = MF.getFunction();
 
@@ -1812,9 +1755,9 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
       // Do not rematerialize an instruction it it uses registers that aren't
       // available at its use. This ensures that we are not extending any live
      // range while rematerializing.
-      SlotIndex DefIdx = DAG.LIS->getInstructionIndex(DefMI);
       SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
-      if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx))
+      if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI,
+                                              *DAG.TII))
         continue;
 
       REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 06b9b64..8ea4267 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -496,12 +496,6 @@ private:
   /// stage to their pre-stage values.
   void finalizeGCNSchedStage() override;
 
-  /// \p Returns true if all the uses in \p InstToRemat defined at \p
-  /// OriginalIdx are live at \p RematIdx. This only checks liveness of virtual
-  /// reg uses.
-  bool allUsesAvailableAt(const MachineInstr *InstToRemat,
-                          SlotIndex OriginalIdx, SlotIndex RematIdx) const;
-
 public:
   bool initGCNSchedStage() override;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 77df721..54f57e0 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -314,9 +314,10 @@ let SubtargetPredicate = HasGFX950Insts, OtherPredicates = [HasBF16ConversionIns
   defm V_CVT_F32_BF16 : VOP1Inst_t16 <"v_cvt_f32_bf16", VOP_F32_BF16>;
 }
 let SubtargetPredicate = isGFX1250Plus, OtherPredicates = [HasBF16ConversionInsts] in {
-  defm V_CVT_F32_BF16_gfx1250 : VOP1Inst_t16_with_profiles <"v_cvt_f32_bf16_gfx1250", VOP_F32_BF16,
-                                                            VOPProfile_CVT_F32_BF16_gfx1250_t16,
-                                                            VOPProfile_CVT_F32_BF16_gfx1250_fake16>;
+  let True16Predicate = UseRealTrue16Insts in
+    defm V_CVT_F32_BF16_gfx1250_t16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_t16", VOPProfile_CVT_F32_BF16_gfx1250_t16>;
+  let True16Predicate = UseFakeTrue16Insts in
+    defm V_CVT_F32_BF16_gfx1250_fake16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_fake16", VOPProfile_CVT_F32_BF16_gfx1250_fake16>;
 }
 
 let ReadsModeReg = 0, mayRaiseFPException = 0 in {
@@ -899,6 +900,7 @@ class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p =
   let DecoderNamespace = Gen.DecoderNamespace;
   let OtherPredicates = !listconcat(ps.OtherPredicates,
                                     !if(p.HasExt64BitDPP, [HasDPALU_DPP], []));
+  let True16Predicate = ps.True16Predicate;
 }
 
 class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
@@ -921,6 +923,7 @@ class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pf
   VOP1_DPP8<op, ps, p> {
   let AssemblerPredicate = Gen.AssemblerPredicate;
   let DecoderNamespace = Gen.DecoderNamespace;
+  let True16Predicate = ps.True16Predicate;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1149,7 +1152,7 @@ defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>;
 defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>;
 defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
 defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>;
-defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
+defm V_CVT_F32_BF16_gfx1250 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16">;
 defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>;
 defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>;
 defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index d4d9e54..4105618 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -46,6 +46,8 @@ private:
                   MachineBasicBlock::iterator &NextMBBI);
   bool expandCCOp(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                   MachineBasicBlock::iterator &NextMBBI);
+  bool expandCCOpToCMov(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MBBI);
   bool expandVMSET_VMCLR(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI, unsigned Opcode);
   bool expandMV_FPR16INX(MachineBasicBlock &MBB,
@@ -178,6 +180,9 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
 bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MBBI,
                                    MachineBasicBlock::iterator &NextMBBI) {
+  // First try expanding to a Conditional Move rather than a branch+mv
+  if (expandCCOpToCMov(MBB, MBBI))
+    return true;
 
   MachineFunction *MF = MBB.getParent();
   MachineInstr &MI = *MBBI;
@@ -277,6 +282,86 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
   return true;
 }
 
+bool RISCVExpandPseudo::expandCCOpToCMov(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MBBI) {
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL = MI.getDebugLoc();
+
+  if (MI.getOpcode() != RISCV::PseudoCCMOVGPR &&
+      MI.getOpcode() != RISCV::PseudoCCMOVGPRNoX0)
+    return false;
+
+  if (!STI->hasVendorXqcicm())
+    return false;
+
+  // FIXME: Would be wonderful to support LHS=X0, but not very easy.
+  if (MI.getOperand(1).getReg() == RISCV::X0 ||
+      MI.getOperand(4).getReg() == RISCV::X0 ||
+      MI.getOperand(5).getReg() == RISCV::X0)
+    return false;
+
+  auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm());
+
+  unsigned CMovOpcode, CMovIOpcode;
+  switch (CC) {
+  default:
+    llvm_unreachable("Unhandled CC");
+  case RISCVCC::COND_EQ:
+    CMovOpcode = RISCV::QC_MVEQ;
+    CMovIOpcode = RISCV::QC_MVEQI;
+    break;
+  case RISCVCC::COND_NE:
+    CMovOpcode = RISCV::QC_MVNE;
+    CMovIOpcode = RISCV::QC_MVNEI;
+    break;
+  case RISCVCC::COND_LT:
+    CMovOpcode = RISCV::QC_MVLT;
+    CMovIOpcode = RISCV::QC_MVLTI;
+    break;
+  case RISCVCC::COND_GE:
+    CMovOpcode = RISCV::QC_MVGE;
+    CMovIOpcode = RISCV::QC_MVGEI;
+    break;
+  case RISCVCC::COND_LTU:
+    CMovOpcode = RISCV::QC_MVLTU;
+    CMovIOpcode = RISCV::QC_MVLTUI;
+    break;
+  case RISCVCC::COND_GEU:
+    CMovOpcode = RISCV::QC_MVGEU;
+    CMovIOpcode = RISCV::QC_MVGEUI;
+    break;
+  }
+
+  if (MI.getOperand(2).getReg() == RISCV::X0) {
+    // $dst = PseudoCCMOVGPR $lhs, X0, $cc, $falsev (=$dst), $truev
+    // $dst = PseudoCCMOVGPRNoX0 $lhs, X0, $cc, $falsev (=$dst), $truev
+    // =>
+    // $dst = QC_MVccI $falsev (=$dst), $lhs, 0, $truev
+    BuildMI(MBB, MBBI, DL, TII->get(CMovIOpcode))
+        .addDef(MI.getOperand(0).getReg())
+        .addReg(MI.getOperand(4).getReg())
+        .addReg(MI.getOperand(1).getReg())
+        .addImm(0)
+        .addReg(MI.getOperand(5).getReg());
+
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // $dst = PseudoCCMOVGPR $lhs, $rhs, $cc, $falsev (=$dst), $truev
+  // $dst = PseudoCCMOVGPRNoX0 $lhs, $rhs, $cc, $falsev (=$dst), $truev
+  // =>
+  // $dst = QC_MVcc $falsev (=$dst), $lhs, $rhs, $truev
+  BuildMI(MBB, MBBI, DL, TII->get(CMovOpcode))
+      .addDef(MI.getOperand(0).getReg())
+      .addReg(MI.getOperand(4).getReg())
+      .addReg(MI.getOperand(1).getReg())
+      .addReg(MI.getOperand(2).getReg())
+      .addReg(MI.getOperand(5).getReg());
+  MI.eraseFromParent();
+  return true;
+}
+
 bool RISCVExpandPseudo::expandVMSET_VMCLR(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MBBI,
                                           unsigned Opcode) {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 5407868..efdbd12 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1350,6 +1350,10 @@ class QCIMVCCIPat<CondCode Cond, QCIMVCCI Inst, DAGOperand InTyImm>
     : Pat<(i32 (riscv_selectcc (i32 GPRNoX0:$rs1), InTyImm:$imm, Cond, (i32 GPRNoX0:$rs3), (i32 GPRNoX0:$rd))),
           (Inst GPRNoX0:$rd, GPRNoX0:$rs1, InTyImm:$imm, GPRNoX0:$rs3)>;
 
+class QCIMVCCIZeroPat<CondCode Cond, QCIMVCCI Inst>
+    : Pat<(i32 (riscv_selectcc (i32 GPRNoX0:$rs1), (i32 0), Cond, (i32 GPRNoX0:$rs3), (i32 GPRNoX0:$rd))),
+          (Inst GPRNoX0:$rd, GPRNoX0:$rs1, 0, GPRNoX0:$rs3)>;
+
 class QCISELECTCCIPat<CondCode Cond, QCISELECTCCI Inst>
     : Pat<(i32 (riscv_selectcc (i32 GPRNoX0:$rd), simm5:$imm, Cond, (i32 GPRNoX0:$rs2), (i32 GPRNoX0:$rs3))),
           (Inst GPRNoX0:$rd, simm5:$imm, GPRNoX0:$rs2, GPRNoX0:$rs3)>;
@@ -1538,14 +1542,7 @@ def: Pat<(i32 (ctlz (not (i32 GPR:$rs1)))), (QC_CLO GPR:$rs1)>;
 let Predicates = [HasVendorXqciint, IsRV32] in
 def : Pat<(riscv_mileaveret_glue), (QC_C_MILEAVERET)>;
 
-let Predicates = [HasVendorXqcicm, IsRV32] in {
-// (SELECT X, Y, Z) is canonicalised to `(riscv_selectcc x, 0, NE, y, z)`.
-// This exists to prioritise over the `Select_GPR_Using_CC_GPR` pattern.
-def : Pat<(i32 (riscv_selectcc (i32 GPRNoX0:$rs1), (i32 0), SETNE, (i32 GPRNoX0:$rs3), (i32 GPRNoX0:$rd))),
-          (QC_MVNEI GPRNoX0:$rd, GPRNoX0:$rs1, 0, GPRNoX0:$rs3)>;
-def : Pat<(i32 (riscv_selectcc (i32 GPRNoX0:$rs1), (i32 0), SETEQ, (i32 GPRNoX0:$rs3), (i32 GPRNoX0:$rd))),
-          (QC_MVEQI GPRNoX0:$rd, GPRNoX0:$rs1, 0, GPRNoX0:$rs3)>;
-
+let Predicates = [HasVendorXqcicm, NoShortForwardBranchOpt, IsRV32] in {
 def : QCIMVCCPat<SETEQ, QC_MVEQ>;
 def : QCIMVCCPat<SETNE, QC_MVNE>;
 def : QCIMVCCPat<SETLT, QC_MVLT>;
@@ -1553,12 +1550,24 @@ def : QCIMVCCPat<SETULT, QC_MVLTU>;
 def : QCIMVCCPat<SETGE, QC_MVGE>;
 def : QCIMVCCPat<SETUGE, QC_MVGEU>;
 
-def : QCIMVCCIPat<SETEQ, QC_MVEQI, simm5>;
-def : QCIMVCCIPat<SETNE, QC_MVNEI, simm5>;
-def : QCIMVCCIPat<SETLT, QC_MVLTI, simm5>;
-def : QCIMVCCIPat<SETULT, QC_MVLTUI, uimm5>;
-def : QCIMVCCIPat<SETGE, QC_MVGEI, simm5>;
-def : QCIMVCCIPat<SETUGE, QC_MVGEUI, uimm5>;
+// These exist to prioritise over the `Select_GPR_Using_CC_GPR` pattern for X0.
+def : QCIMVCCIZeroPat<SETEQ, QC_MVEQI>;
+def : QCIMVCCIZeroPat<SETNE, QC_MVNEI>;
+def : QCIMVCCIZeroPat<SETLT, QC_MVLTI>;
+def : QCIMVCCIZeroPat<SETULT, QC_MVLTUI>;
+def : QCIMVCCIZeroPat<SETGE, QC_MVGEI>;
+def : QCIMVCCIZeroPat<SETUGE, QC_MVGEUI>;
+}
+
+let Predicates = [HasVendorXqcicm, IsRV32] in {
+// These all use *imm5nonzero because we want to use PseudoCCMOVGPR with X0 when SFB is enabled.
+// When SFB is not enabled, the `QCIMVCCIZeroPat`s above will be used if RHS=0.
+def : QCIMVCCIPat<SETEQ, QC_MVEQI, simm5nonzero>;
+def : QCIMVCCIPat<SETNE, QC_MVNEI, simm5nonzero>;
+def : QCIMVCCIPat<SETLT, QC_MVLTI, simm5nonzero>;
+def : QCIMVCCIPat<SETULT, QC_MVLTUI, uimm5nonzero>;
+def : QCIMVCCIPat<SETGE, QC_MVGEI, simm5nonzero>;
+def : QCIMVCCIPat<SETUGE, QC_MVGEUI, uimm5nonzero>;
 }
 
 let Predicates = [HasVendorXqcicli, IsRV32] in {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index d4124ae..ee25f69 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -3139,8 +3139,8 @@ bool RISCVTTIImpl::isProfitableToSinkOperands(
     bool IsVPSplat = match(Op, m_Intrinsic<Intrinsic::experimental_vp_splat>(
                                    m_Value(), m_Value(), m_Value()));
     if (!IsVPSplat &&
-        !match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
-                             m_Undef(), m_ZeroMask())))
+        !match(Op, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
+                             m_Value(), m_ZeroMask())))
       continue;
 
     // Don't sink i1 splats.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cd04ff5..34854e4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44615,8 +44615,11 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
     APInt DemandedMask = OriginalDemandedBits << ShAmt;
 
-    // If we just want the sign bit then we don't need to shift it.
-    if (OriginalDemandedBits.isSignMask())
+    // If we only want bits that already match the signbit then we don't need
+    // to shift.
+    unsigned NumHiDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
+    if (TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1) >=
+        NumHiDemandedBits)
       return TLO.CombineTo(Op, Op0);
 
     // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
@@ -45169,6 +45172,18 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
   case X86ISD::Wrapper:
   case X86ISD::WrapperRIP:
     return true;
+  case X86ISD::PACKSS:
+  case X86ISD::PACKUS: {
+    APInt DemandedLHS, DemandedRHS;
+    getPackDemandedElts(Op.getSimpleValueType(), DemandedElts, DemandedLHS,
+                        DemandedRHS);
+    return (!DemandedLHS ||
+            DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedLHS,
+                                                 PoisonOnly, Depth + 1)) &&
+           (!DemandedRHS ||
+            DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(1), DemandedRHS,
+                                                 PoisonOnly, Depth + 1));
+  }
   case X86ISD::INSERTPS:
   case X86ISD::BLENDI:
   case X86ISD::PSHUFB:
@@ -45239,6 +45254,10 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
   case X86ISD::BLENDI:
   case X86ISD::BLENDV:
     return false;
+  // SSE packs.
+  case X86ISD::PACKSS:
+  case X86ISD::PACKUS:
+    return false;
   // SSE target shuffles.
   case X86ISD::INSERTPS:
   case X86ISD::PSHUFB: