Diffstat (limited to 'llvm/lib/Target')
 llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td     | 62
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp        | 63
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h          |  6
 llvm/lib/Target/AMDGPU/VOP1Instructions.td         | 11
 llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp   | 85
 llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td        | 37
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp |  4
 llvm/lib/Target/X86/X86ISelLowering.cpp            | 23
 8 files changed, 169 insertions(+), 122 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 1e30735..36c9cb6 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -707,16 +707,14 @@ let Predicates = [HasSVE_or_SME] in {
defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", AArch64sdot>;
defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", AArch64udot>;
- let Predicates = [HasSVE_or_SME] in {
- def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)),
- (UDOT_ZZZ_BtoS $Acc, $MulLHS, $MulRHS)>;
- def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)),
- (SDOT_ZZZ_BtoS $Acc, $MulLHS, $MulRHS)>;
- def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
- (UDOT_ZZZ_HtoD $Acc, $MulLHS, $MulRHS)>;
- def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
- (SDOT_ZZZ_HtoD $Acc, $MulLHS, $MulRHS)>;
- } // End HasSVE_or_SME
+ def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)),
+ (UDOT_ZZZ_BtoS $Acc, $MulLHS, $MulRHS)>;
+ def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)),
+ (SDOT_ZZZ_BtoS $Acc, $MulLHS, $MulRHS)>;
+ def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
+ (UDOT_ZZZ_HtoD $Acc, $MulLHS, $MulRHS)>;
+ def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
+ (SDOT_ZZZ_HtoD $Acc, $MulLHS, $MulRHS)>;
defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>;
defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>;
@@ -3646,6 +3644,9 @@ let Predicates = [HasSVE_or_SME, HasMatMulInt8] in {
defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", AArch64usdot>;
defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>;
defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>;
+
+ def : Pat<(nxv4i32 (partial_reduce_sumla nxv4i32:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)),
+ (USDOT_ZZZ $Acc, $RHS, $LHS)>;
} // End HasSVE_or_SME, HasMatMulInt8
let Predicates = [HasSVE, HasMatMulFP32] in {
@@ -3752,6 +3753,19 @@ let Predicates = [HasSVE2_or_SME] in {
defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb", int_aarch64_sve_umlslb>;
defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt", int_aarch64_sve_umlslt>;
+ def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv4i32:$LHS, nxv4i32:$RHS)),
+ (UMLALT_ZZZ_D (UMLALB_ZZZ_D $Acc, $LHS, $RHS), $LHS, $RHS)>;
+ def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv4i32:$LHS, nxv4i32:$RHS)),
+ (SMLALT_ZZZ_D (SMLALB_ZZZ_D $Acc, $LHS, $RHS), $LHS, $RHS)>;
+ def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$LHS, nxv8i16:$RHS)),
+ (UMLALT_ZZZ_S (UMLALB_ZZZ_S $Acc, $LHS, $RHS), $LHS, $RHS)>;
+ def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$LHS, nxv8i16:$RHS)),
+ (SMLALT_ZZZ_S (SMLALB_ZZZ_S $Acc, $LHS, $RHS), $LHS, $RHS)>;
+ def : Pat<(nxv8i16 (partial_reduce_umla nxv8i16:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)),
+ (UMLALT_ZZZ_H (UMLALB_ZZZ_H $Acc, $LHS, $RHS), $LHS, $RHS)>;
+ def : Pat<(nxv8i16 (partial_reduce_smla nxv8i16:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)),
+ (SMLALT_ZZZ_H (SMLALB_ZZZ_H $Acc, $LHS, $RHS), $LHS, $RHS)>;
+
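[note: the block of patterns just added lowers two-way partial reductions that have no dot-product form by pairing the bottom/top widening multiply-accumulates. A minimal scalar sketch of that pairing for the nxv8i16 -> nxv4i32 case, as an illustration of the semantics rather than LLVM code:]

    #include <cstddef>
    #include <cstdint>

    // UMLALB consumes the even (bottom) source lanes, UMLALT the odd (top)
    // ones; chained as in the patterns above, they accumulate every widened
    // product into the accumulator.
    void umlalb_umlalt_model(uint32_t *acc, const uint16_t *lhs,
                             const uint16_t *rhs, size_t n32) {
      for (size_t i = 0; i < n32; ++i) {
        acc[i] += uint32_t(lhs[2 * i]) * uint32_t(rhs[2 * i]);         // UMLALB
        acc[i] += uint32_t(lhs[2 * i + 1]) * uint32_t(rhs[2 * i + 1]); // UMLALT
      }
    }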
// SVE2 saturating multiply-add long (indexed)
defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb", int_aarch64_sve_sqdmlalb_lane>;
defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt", int_aarch64_sve_sqdmlalt_lane>;
@@ -3880,19 +3894,6 @@ let Predicates = [HasSVE2_or_SME] in {
def : Pat<(nxv8i16 (partial_reduce_smla nxv8i16:$Acc, nxv16i8:$Input, (nxv16i8 (splat_vector (i32 1))))),
(SADDWT_ZZZ_H (SADDWB_ZZZ_H $Acc, $Input), $Input)>;
- def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv4i32:$LHS, nxv4i32:$RHS)),
- (UMLALT_ZZZ_D (UMLALB_ZZZ_D $Acc, $LHS, $RHS), $LHS, $RHS)>;
- def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv4i32:$LHS, nxv4i32:$RHS)),
- (SMLALT_ZZZ_D (SMLALB_ZZZ_D $Acc, $LHS, $RHS), $LHS, $RHS)>;
- def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$LHS, nxv8i16:$RHS)),
- (UMLALT_ZZZ_S (UMLALB_ZZZ_S $Acc, $LHS, $RHS), $LHS, $RHS)>;
- def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$LHS, nxv8i16:$RHS)),
- (SMLALT_ZZZ_S (SMLALB_ZZZ_S $Acc, $LHS, $RHS), $LHS, $RHS)>;
- def : Pat<(nxv8i16 (partial_reduce_umla nxv8i16:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)),
- (UMLALT_ZZZ_H (UMLALB_ZZZ_H $Acc, $LHS, $RHS), $LHS, $RHS)>;
- def : Pat<(nxv8i16 (partial_reduce_smla nxv8i16:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)),
- (SMLALT_ZZZ_H (SMLALB_ZZZ_H $Acc, $LHS, $RHS), $LHS, $RHS)>;
-
// SVE2 integer multiply long
defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>;
defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt", int_aarch64_sve_sqdmullt>;
@@ -4200,11 +4201,6 @@ let Predicates = [HasSVEAES2, HasNonStreamingSVE_or_SSVE_AES] in {
def PMULL_2ZZZ_Q : sve_crypto_pmull_multi<"pmull">;
}
-let Predicates = [HasSVE_or_SME, HasMatMulInt8] in {
- def : Pat<(nxv4i32 (partial_reduce_sumla nxv4i32:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)),
- (USDOT_ZZZ $Acc, $RHS, $LHS)>;
- } // End HasSVE_or_SME, HasMatMulInt8
-
//===----------------------------------------------------------------------===//
// SME or SVE2.1 instructions
//===----------------------------------------------------------------------===//
@@ -4238,12 +4234,10 @@ defm UDOT_ZZZ_HtoS : sve2p1_two_way_dot_vv<"udot", 0b1, int_aarch64_sve_udot_x2
defm SDOT_ZZZI_HtoS : sve2p1_two_way_dot_vvi<"sdot", 0b0, int_aarch64_sve_sdot_lane_x2>;
defm UDOT_ZZZI_HtoS : sve2p1_two_way_dot_vvi<"udot", 0b1, int_aarch64_sve_udot_lane_x2>;
-let Predicates = [HasSVE2p1_or_SME2] in {
- def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
- (UDOT_ZZZ_HtoS $Acc, $MulLHS, $MulRHS)>;
- def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
- (SDOT_ZZZ_HtoS $Acc, $MulLHS, $MulRHS)>;
-} // End HasSVE2p1_or_SME2
+def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
+ (UDOT_ZZZ_HtoS $Acc, $MulLHS, $MulRHS)>;
+def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
+ (SDOT_ZZZ_HtoS $Acc, $MulLHS, $MulRHS)>;
defm SQCVTN_Z2Z_StoH : sve2p1_multi_vec_extract_narrow<"sqcvtn", 0b00, int_aarch64_sve_sqcvtn_x2>;
defm UQCVTN_Z2Z_StoH : sve2p1_multi_vec_extract_narrow<"uqcvtn", 0b01, int_aarch64_sve_uqcvtn_x2>;
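[note: the dot-product selections in this file map each partial-reduction accumulator lane onto one UDOT/SDOT lane; the partial_reduce_sumla pattern swaps its operands because usdot takes the unsigned operand first, hence (USDOT_ZZZ $Acc, $RHS, $LHS). A minimal scalar sketch of the four-way i8 -> i32 case behind the UDOT_ZZZ_BtoS pattern, illustration only:]

    #include <cstddef>
    #include <cstdint>

    // Each i32 accumulator lane absorbs the products of its four adjacent
    // u8 lanes -- exactly what one UDOT lane computes.
    void partial_reduce_umla_model(uint32_t *acc, const uint8_t *lhs,
                                   const uint8_t *rhs, size_t n32) {
      for (size_t i = 0; i < n32; ++i)
        for (size_t j = 0; j < 4; ++j)
          acc[i] += uint32_t(lhs[4 * i + j]) * uint32_t(rhs[4 * i + j]);
    }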
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index fab78a9..bdc0810 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -29,6 +29,7 @@
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Support/ErrorHandling.h"
@@ -1633,64 +1634,6 @@ void GCNSchedStage::revertScheduling() {
DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}
-bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
- SlotIndex OriginalIdx,
- SlotIndex RematIdx) const {
-
- LiveIntervals *LIS = DAG.LIS;
- MachineRegisterInfo &MRI = DAG.MRI;
- OriginalIdx = OriginalIdx.getRegSlot(true);
- RematIdx = std::max(RematIdx, RematIdx.getRegSlot(true));
- for (const MachineOperand &MO : InstToRemat->operands()) {
- if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
- continue;
-
- if (!MO.getReg().isVirtual()) {
- // Do not attempt to reason about PhysRegs
- // TODO: better analysis of PhysReg liveness
- if (!DAG.MRI.isConstantPhysReg(MO.getReg()) &&
- !DAG.TII->isIgnorableUse(MO))
- return false;
-
- // Constant PhysRegs and IgnorableUses are okay
- continue;
- }
-
- LiveInterval &LI = LIS->getInterval(MO.getReg());
- const VNInfo *OVNI = LI.getVNInfoAt(OriginalIdx);
- assert(OVNI);
-
- // Don't allow rematerialization immediately after the original def.
- // It would be incorrect if InstToRemat redefines the register.
- // See PR14098.
- if (SlotIndex::isSameInstr(OriginalIdx, RematIdx))
- return false;
-
- if (OVNI != LI.getVNInfoAt(RematIdx))
- return false;
-
- // Check that subrange is live at RematIdx.
- if (LI.hasSubRanges()) {
- const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
- unsigned SubReg = MO.getSubReg();
- LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
- : MRI.getMaxLaneMaskForVReg(MO.getReg());
- for (LiveInterval::SubRange &SR : LI.subranges()) {
- if ((SR.LaneMask & LM).none())
- continue;
- if (!SR.liveAt(RematIdx))
- return false;
-
- // Early exit if all used lanes are checked. No need to continue.
- LM &= ~SR.LaneMask;
- if (LM.none())
- break;
- }
- }
- }
- return true;
-}
-
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
const Function &F = MF.getFunction();
@@ -1812,9 +1755,9 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
// Do not rematerialize an instruction if it uses registers that aren't
// available at its use. This ensures that we are not extending any live
// range while rematerializing.
- SlotIndex DefIdx = DAG.LIS->getInstructionIndex(DefMI);
SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
- if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx))
+ if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI,
+ *DAG.TII))
continue;
REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
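[note: the local helper removed above is superseded by the shared VirtRegAuxInfo::allUsesAvailableAt. A condensed sketch of the invariant being checked, distilled from the deleted code with subregister lane masks and physreg handling omitted; this is not the shared helper's actual implementation:]

    // Every virtual register read by the remat candidate must carry the same
    // value number at the insertion point as at its original location;
    // otherwise remat would observe a different definition or force a live
    // range to be extended.
    static bool usesStillAvailable(const llvm::MachineInstr &DefMI,
                                   llvm::SlotIndex RematIdx,
                                   const llvm::LiveIntervals &LIS) {
      llvm::SlotIndex OrigIdx = LIS.getInstructionIndex(DefMI).getRegSlot(true);
      for (const llvm::MachineOperand &MO : DefMI.operands()) {
        if (!MO.isReg() || !MO.getReg().isVirtual() || !MO.readsReg())
          continue;
        const llvm::LiveInterval &LI = LIS.getInterval(MO.getReg());
        if (LI.getVNInfoAt(OrigIdx) != LI.getVNInfoAt(RematIdx))
          return false;
      }
      return true;
    }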
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 06b9b64..8ea4267 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -496,12 +496,6 @@ private:
/// stage to their pre-stage values.
void finalizeGCNSchedStage() override;
- /// \p Returns true if all the uses in \p InstToRemat defined at \p
- /// OriginalIdx are live at \p RematIdx. This only checks liveness of virtual
- /// reg uses.
- bool allUsesAvailableAt(const MachineInstr *InstToRemat,
- SlotIndex OriginalIdx, SlotIndex RematIdx) const;
-
public:
bool initGCNSchedStage() override;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 77df721..54f57e0 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -314,9 +314,10 @@ let SubtargetPredicate = HasGFX950Insts, OtherPredicates = [HasBF16ConversionIns
defm V_CVT_F32_BF16 : VOP1Inst_t16 <"v_cvt_f32_bf16", VOP_F32_BF16>;
}
let SubtargetPredicate = isGFX1250Plus, OtherPredicates = [HasBF16ConversionInsts] in {
- defm V_CVT_F32_BF16_gfx1250 : VOP1Inst_t16_with_profiles <"v_cvt_f32_bf16_gfx1250", VOP_F32_BF16,
- VOPProfile_CVT_F32_BF16_gfx1250_t16,
- VOPProfile_CVT_F32_BF16_gfx1250_fake16>;
+ let True16Predicate = UseRealTrue16Insts in
+ defm V_CVT_F32_BF16_gfx1250_t16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_t16", VOPProfile_CVT_F32_BF16_gfx1250_t16>;
+ let True16Predicate = UseFakeTrue16Insts in
+ defm V_CVT_F32_BF16_gfx1250_fake16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_fake16", VOPProfile_CVT_F32_BF16_gfx1250_fake16>;
}
let ReadsModeReg = 0, mayRaiseFPException = 0 in {
@@ -899,6 +900,7 @@ class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p =
let DecoderNamespace = Gen.DecoderNamespace;
let OtherPredicates = !listconcat(ps.OtherPredicates,
!if(p.HasExt64BitDPP, [HasDPALU_DPP], []));
+ let True16Predicate = ps.True16Predicate;
}
class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
@@ -921,6 +923,7 @@ class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pf
VOP1_DPP8<op, ps, p> {
let AssemblerPredicate = Gen.AssemblerPredicate;
let DecoderNamespace = Gen.DecoderNamespace;
+ let True16Predicate = ps.True16Predicate;
}
//===----------------------------------------------------------------------===//
@@ -1149,7 +1152,7 @@ defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>;
defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>;
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>;
-defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
+defm V_CVT_F32_BF16_gfx1250 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16">;
defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>;
defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>;
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index d4d9e54..4105618 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -46,6 +46,8 @@ private:
MachineBasicBlock::iterator &NextMBBI);
bool expandCCOp(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
+ bool expandCCOpToCMov(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
bool expandVMSET_VMCLR(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, unsigned Opcode);
bool expandMV_FPR16INX(MachineBasicBlock &MBB,
@@ -178,6 +180,9 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI) {
+ // First try expanding to a conditional move rather than a branch+mv.
+ if (expandCCOpToCMov(MBB, MBBI))
+ return true;
MachineFunction *MF = MBB.getParent();
MachineInstr &MI = *MBBI;
@@ -277,6 +282,86 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
return true;
}
+bool RISCVExpandPseudo::expandCCOpToCMov(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+
+ if (MI.getOpcode() != RISCV::PseudoCCMOVGPR &&
+ MI.getOpcode() != RISCV::PseudoCCMOVGPRNoX0)
+ return false;
+
+ if (!STI->hasVendorXqcicm())
+ return false;
+
+ // FIXME: Would be wonderful to support LHS=X0, but not very easy.
+ if (MI.getOperand(1).getReg() == RISCV::X0 ||
+ MI.getOperand(4).getReg() == RISCV::X0 ||
+ MI.getOperand(5).getReg() == RISCV::X0)
+ return false;
+
+ auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm());
+
+ unsigned CMovOpcode, CMovIOpcode;
+ switch (CC) {
+ default:
+ llvm_unreachable("Unhandled CC");
+ case RISCVCC::COND_EQ:
+ CMovOpcode = RISCV::QC_MVEQ;
+ CMovIOpcode = RISCV::QC_MVEQI;
+ break;
+ case RISCVCC::COND_NE:
+ CMovOpcode = RISCV::QC_MVNE;
+ CMovIOpcode = RISCV::QC_MVNEI;
+ break;
+ case RISCVCC::COND_LT:
+ CMovOpcode = RISCV::QC_MVLT;
+ CMovIOpcode = RISCV::QC_MVLTI;
+ break;
+ case RISCVCC::COND_GE:
+ CMovOpcode = RISCV::QC_MVGE;
+ CMovIOpcode = RISCV::QC_MVGEI;
+ break;
+ case RISCVCC::COND_LTU:
+ CMovOpcode = RISCV::QC_MVLTU;
+ CMovIOpcode = RISCV::QC_MVLTUI;
+ break;
+ case RISCVCC::COND_GEU:
+ CMovOpcode = RISCV::QC_MVGEU;
+ CMovIOpcode = RISCV::QC_MVGEUI;
+ break;
+ }
+
+ if (MI.getOperand(2).getReg() == RISCV::X0) {
+ // $dst = PseudoCCMOVGPR $lhs, X0, $cc, $falsev (=$dst), $truev
+ // $dst = PseudoCCMOVGPRNoX0 $lhs, X0, $cc, $falsev (=$dst), $truev
+ // =>
+ // $dst = QC_MVccI $falsev (=$dst), $lhs, 0, $truev
+ BuildMI(MBB, MBBI, DL, TII->get(CMovIOpcode))
+ .addDef(MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(4).getReg())
+ .addReg(MI.getOperand(1).getReg())
+ .addImm(0)
+ .addReg(MI.getOperand(5).getReg());
+
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // $dst = PseudoCCMOVGPR $lhs, $rhs, $cc, $falsev (=$dst), $truev
+ // $dst = PseudoCCMOVGPRNoX0 $lhs, $rhs, $cc, $falsev (=$dst), $truev
+ // =>
+ // $dst = QC_MVcc $falsev (=$dst), $lhs, $rhs, $truev
+ BuildMI(MBB, MBBI, DL, TII->get(CMovOpcode))
+ .addDef(MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(4).getReg())
+ .addReg(MI.getOperand(1).getReg())
+ .addReg(MI.getOperand(2).getReg())
+ .addReg(MI.getOperand(5).getReg());
+ MI.eraseFromParent();
+ return true;
+}
+
bool RISCVExpandPseudo::expandVMSET_VMCLR(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned Opcode) {
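[note: the expansion above relies on the Xqcicm conditional moves writing their destination only when the condition holds; because $falsev is tied to $dst, the old destination value is the implicit "false" arm. A hedged scalar model of QC_MVEQI, with symbolic register names, illustration only:]

    #include <cstdint>

    // QC_MVEQI rd, rs1, imm, rs3: rd doubles as the tied old value (the
    // "false" arm) and the destination; rs3 supplies the "true" arm.
    uint32_t qc_mveqi_model(uint32_t rd_falsev, uint32_t rs1, int32_t imm,
                            uint32_t rs3_truev) {
      return rs1 == (uint32_t)imm ? rs3_truev : rd_falsev;
    }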
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 5407868..efdbd12 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1350,6 +1350,10 @@ class QCIMVCCIPat<CondCode Cond, QCIMVCCI Inst, DAGOperand InTyImm>
: Pat<(i32 (riscv_selectcc (i32 GPRNoX0:$rs1), InTyImm:$imm, Cond, (i32 GPRNoX0:$rs3), (i32 GPRNoX0:$rd))),
(Inst GPRNoX0:$rd, GPRNoX0:$rs1, InTyImm:$imm, GPRNoX0:$rs3)>;
+class QCIMVCCIZeroPat<CondCode Cond, QCIMVCCI Inst>
+ : Pat<(i32 (riscv_selectcc (i32 GPRNoX0:$rs1), (i32 0), Cond, (i32 GPRNoX0:$rs3), (i32 GPRNoX0:$rd))),
+ (Inst GPRNoX0:$rd, GPRNoX0:$rs1, 0, GPRNoX0:$rs3)>;
+
class QCISELECTCCIPat<CondCode Cond, QCISELECTCCI Inst>
: Pat<(i32 (riscv_selectcc (i32 GPRNoX0:$rd), simm5:$imm, Cond, (i32 GPRNoX0:$rs2), (i32 GPRNoX0:$rs3))),
(Inst GPRNoX0:$rd, simm5:$imm, GPRNoX0:$rs2, GPRNoX0:$rs3)>;
@@ -1538,14 +1542,7 @@ def: Pat<(i32 (ctlz (not (i32 GPR:$rs1)))), (QC_CLO GPR:$rs1)>;
let Predicates = [HasVendorXqciint, IsRV32] in
def : Pat<(riscv_mileaveret_glue), (QC_C_MILEAVERET)>;
-let Predicates = [HasVendorXqcicm, IsRV32] in {
-// (SELECT X, Y, Z) is canonicalised to `(riscv_selectcc x, 0, NE, y, z)`.
-// This exists to prioritise over the `Select_GPR_Using_CC_GPR` pattern.
-def : Pat<(i32 (riscv_selectcc (i32 GPRNoX0:$rs1), (i32 0), SETNE, (i32 GPRNoX0:$rs3), (i32 GPRNoX0:$rd))),
- (QC_MVNEI GPRNoX0:$rd, GPRNoX0:$rs1, 0, GPRNoX0:$rs3)>;
-def : Pat<(i32 (riscv_selectcc (i32 GPRNoX0:$rs1), (i32 0), SETEQ, (i32 GPRNoX0:$rs3), (i32 GPRNoX0:$rd))),
- (QC_MVEQI GPRNoX0:$rd, GPRNoX0:$rs1, 0, GPRNoX0:$rs3)>;
-
+let Predicates = [HasVendorXqcicm, NoShortForwardBranchOpt, IsRV32] in {
def : QCIMVCCPat<SETEQ, QC_MVEQ>;
def : QCIMVCCPat<SETNE, QC_MVNE>;
def : QCIMVCCPat<SETLT, QC_MVLT>;
@@ -1553,12 +1550,24 @@ def : QCIMVCCPat<SETULT, QC_MVLTU>;
def : QCIMVCCPat<SETGE, QC_MVGE>;
def : QCIMVCCPat<SETUGE, QC_MVGEU>;
-def : QCIMVCCIPat<SETEQ, QC_MVEQI, simm5>;
-def : QCIMVCCIPat<SETNE, QC_MVNEI, simm5>;
-def : QCIMVCCIPat<SETLT, QC_MVLTI, simm5>;
-def : QCIMVCCIPat<SETULT, QC_MVLTUI, uimm5>;
-def : QCIMVCCIPat<SETGE, QC_MVGEI, simm5>;
-def : QCIMVCCIPat<SETUGE, QC_MVGEUI, uimm5>;
+// These exist to prioritise over the `Select_GPR_Using_CC_GPR` pattern for X0.
+def : QCIMVCCIZeroPat<SETEQ, QC_MVEQI>;
+def : QCIMVCCIZeroPat<SETNE, QC_MVNEI>;
+def : QCIMVCCIZeroPat<SETLT, QC_MVLTI>;
+def : QCIMVCCIZeroPat<SETULT, QC_MVLTUI>;
+def : QCIMVCCIZeroPat<SETGE, QC_MVGEI>;
+def : QCIMVCCIZeroPat<SETUGE, QC_MVGEUI>;
+}
+
+let Predicates = [HasVendorXqcicm, IsRV32] in {
+// These all use *imm5nonzero because we want to use PseudoCCMOVGPR with X0 when SFB is enabled.
+// When SFB is not enabled, the `QCIMVCCIZeroPat`s above will be used if RHS=0.
+def : QCIMVCCIPat<SETEQ, QC_MVEQI, simm5nonzero>;
+def : QCIMVCCIPat<SETNE, QC_MVNEI, simm5nonzero>;
+def : QCIMVCCIPat<SETLT, QC_MVLTI, simm5nonzero>;
+def : QCIMVCCIPat<SETULT, QC_MVLTUI, uimm5nonzero>;
+def : QCIMVCCIPat<SETGE, QC_MVGEI, simm5nonzero>;
+def : QCIMVCCIPat<SETUGE, QC_MVGEUI, uimm5nonzero>;
}
let Predicates = [HasVendorXqcicli, IsRV32] in {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index d4124ae..ee25f69 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -3139,8 +3139,8 @@ bool RISCVTTIImpl::isProfitableToSinkOperands(
bool IsVPSplat = match(Op, m_Intrinsic<Intrinsic::experimental_vp_splat>(
m_Value(), m_Value(), m_Value()));
if (!IsVPSplat &&
- !match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
- m_Undef(), m_ZeroMask())))
+ !match(Op, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
+ m_Value(), m_ZeroMask())))
continue;
// Don't sink i1 splats.
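[note: relaxing the two m_Undef() positions to m_Value() is safe because a zero shuffle mask only ever reads lane 0 of the insertelement, so the base vector and the second shuffle operand are irrelevant. A small sketch of the shape now matched, assuming the standard llvm::PatternMatch helpers; the wrapper name is hypothetical:]

    using namespace llvm;
    using namespace llvm::PatternMatch;

    // Matches the splat idiom
    //   %ie = insertelement <N x T> %any,  T %x, i64 0
    //   %sp = shufflevector <N x T> %ie, <N x T> %any2, zeroinitializer
    // where %any/%any2 no longer need to be undef: every result lane is a
    // copy of %x regardless of their contents.
    static bool isSplatOfLaneZero(Value *Op) {
      return match(Op,
                   m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
                             m_Value(), m_ZeroMask()));
    }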
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cd04ff5..34854e4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44615,8 +44615,11 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
APInt DemandedMask = OriginalDemandedBits << ShAmt;
- // If we just want the sign bit then we don't need to shift it.
- if (OriginalDemandedBits.isSignMask())
+ // If we only want bits that already match the signbit then we don't need
+ // to shift.
+ unsigned NumHiDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
+ if (TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1) >=
+ NumHiDemandedBits)
return TLO.CombineTo(Op, Op0);
// fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
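[note: the widened condition is sound for any shift amount: each demanded result bit of the VSRAI is either a sign fill or comes from a source bit inside Op0's sign-extended top NumHiDemandedBits bits, so it always equals the bit already in that position of Op0. A worked example with made-up values -- BitWidth 16, demanded bits 0xF000, hence NumHiDemandedBits = 4:]

    #include <cassert>
    #include <cstdint>

    int main() {
      // Demanded bits 0xF000: countr_zero = 12, NumHiDemandedBits = 16 - 12 = 4.
      // x = 0xF923 has 5 sign bits (bits 15..11 all equal bit 15), so >= 4.
      int16_t x = (int16_t)0xF923;
      for (int sh = 0; sh < 16; ++sh) {
        // The arithmetic right shift (sign-extending; guaranteed since C++20)
        // never changes the four demanded top bits.
        int16_t shifted = (int16_t)(x >> sh);
        assert((shifted & 0xF000) == (x & 0xF000));
      }
      return 0;
    }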
@@ -45169,6 +45172,18 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
case X86ISD::Wrapper:
case X86ISD::WrapperRIP:
return true;
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS: {
+ APInt DemandedLHS, DemandedRHS;
+ getPackDemandedElts(Op.getSimpleValueType(), DemandedElts, DemandedLHS,
+ DemandedRHS);
+ return (!DemandedLHS ||
+ DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedLHS,
+ PoisonOnly, Depth + 1)) &&
+ (!DemandedRHS ||
+ DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(1), DemandedRHS,
+ PoisonOnly, Depth + 1));
+ }
case X86ISD::INSERTPS:
case X86ISD::BLENDI:
case X86ISD::PSHUFB:
@@ -45239,6 +45254,10 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
case X86ISD::BLENDI:
case X86ISD::BLENDV:
return false;
+ // SSE packs.
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS:
+ return false;
// SSE target shuffles.
case X86ISD::INSERTPS:
case X86ISD::PSHUFB:
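[note: PACKSS/PACKUS can be declared unable to create undef or poison because saturation is a total function of the source elements; the only poison they return is poison already present in the demanded source halves, which the isGuaranteedNotToBeUndefOrPoison case above walks via getPackDemandedElts. A hedged scalar model of one lane of each, illustration only:]

    #include <cstdint>

    // PACKSS lane: signed saturate i16 -> i8.
    int8_t packss_lane(int16_t v) {
      if (v > INT8_MAX) return INT8_MAX;
      if (v < INT8_MIN) return INT8_MIN;
      return (int8_t)v;
    }

    // PACKUS lane: unsigned saturate of the signed source, i16 -> u8.
    uint8_t packus_lane(int16_t v) {
      if (v > (int16_t)UINT8_MAX) return UINT8_MAX;
      if (v < 0) return 0;
      return (uint8_t)v;
    }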