Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 66
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td | 14
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 10
-rw-r--r--  llvm/lib/Target/AArch64/SMEInstrFormats.td | 27
-rw-r--r--  llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp | 82
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 25
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 59
-rw-r--r--  llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td | 27
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 54
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 1
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 41
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 15
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 3
-rw-r--r--  llvm/lib/Target/PowerPC/PPCRegisterInfo.h | 5
-rw-r--r--  llvm/lib/Target/RISCV/RISCVCallingConv.td | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 81
-rw-r--r--  llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 5
-rw-r--r--  llvm/lib/Target/RISCV/RISCVRegisterInfo.h | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 38
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 6
-rw-r--r--  llvm/lib/Target/X86/X86ExpandPseudo.cpp | 4
-rw-r--r--  llvm/lib/Target/X86/X86FrameLowering.cpp | 121
-rw-r--r--  llvm/lib/Target/X86/X86FrameLowering.h | 50
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 19
26 files changed, 533 insertions, 231 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 84f6d42..8617377 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8787,51 +8787,6 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
return ZExtBool;
}
-// The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the
-// input operands are copy nodes where the source register is in a
-// StridedOrContiguous class. For example:
-//
-// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
-// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
-// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
-// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
-// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
-// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
-// %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
-//
-bool shouldUseFormStridedPseudo(MachineInstr &MI) {
- MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
-
- assert((MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
- MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) &&
- "Unexpected opcode.");
-
- MCRegister SubReg = MCRegister::NoRegister;
- for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
- MachineOperand &MO = MI.getOperand(I);
- assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE");
-
- MachineOperand *Def = MRI.getOneDef(MO.getReg());
- if (!Def || !Def->getParent()->isCopy())
- return false;
-
- const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
- unsigned OpSubReg = CopySrc.getSubReg();
- if (SubReg == MCRegister::NoRegister)
- SubReg = OpSubReg;
-
- MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
- const TargetRegisterClass *CopySrcClass =
- MRI.getRegClass(CopySrcOp->getReg());
- if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
- (CopySrcClass != &AArch64::ZPR2StridedOrContiguousRegClass &&
- CopySrcClass != &AArch64::ZPR4StridedOrContiguousRegClass))
- return false;
- }
-
- return true;
-}
-
void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
// Live-in physreg copies that are glued to SMSTART are applied as
@@ -8857,27 +8812,6 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
}
}
- if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
- MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
- // If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies
- // from a StridedOrContiguous class, fall back on REG_SEQUENCE node.
- if (shouldUseFormStridedPseudo(MI))
- return;
-
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
- TII->get(TargetOpcode::REG_SEQUENCE),
- MI.getOperand(0).getReg());
-
- for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
- MIB.add(MI.getOperand(I));
- MIB.addImm(AArch64::zsub0 + (I - 1));
- }
-
- MI.eraseFromParent();
- return;
- }
-
// Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
// have nothing to do with VG, were it not that they are used to materialise a
// frame-address. If they contain a frame-index to a scalable vector, this
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 3c57ba4..a0928b9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -428,7 +428,6 @@ def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
SDTCisVT<2, OtherVT>]>;
-
def SDT_AArch64CSel : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
@@ -451,6 +450,7 @@ def SDT_AArch64FCCMP : SDTypeProfile<1, 5,
def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisFP<1>,
SDTCisSameAs<2, 1>]>;
+def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>;
def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
def SDT_AArch64Insr : SDTypeProfile<1, 2, [SDTCisVec<0>]>;
@@ -817,11 +817,9 @@ def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>;
def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>;
def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>;
-def AArch64rev16_scalar : SDNode<"AArch64ISD::REV16", SDTIntUnaryOp>;
-
-def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>;
-def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>;
-def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
+def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64Rev>;
+def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64Rev>;
+def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64Rev>;
def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>;
def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>;
@@ -3000,8 +2998,8 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>;
def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>;
-def : Pat<(AArch64rev16_scalar GPR32:$Rn), (REV16Wr GPR32:$Rn)>;
-def : Pat<(AArch64rev16_scalar GPR64:$Rn), (REV16Xr GPR64:$Rn)>;
+def : Pat<(AArch64rev16 GPR32:$Rn), (REV16Wr GPR32:$Rn)>;
+def : Pat<(AArch64rev16 GPR64:$Rn), (REV16Xr GPR64:$Rn)>;
def : Pat<(or (and (srl GPR64:$Rn, (i64 8)), (i64 0x00ff00ff00ff00ff)),
(and (shl GPR64:$Rn, (i64 8)), (i64 0xff00ff00ff00ff00))),
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index aae2fda..a6edcf1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -940,6 +940,16 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
+ case Intrinsic::experimental_cttz_elts: {
+ EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+ if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
+ // This will consist of a SVE brkb and a cntp instruction. These
+ // typically have the same latency and half the throughput as a vector
+ // add instruction.
+ return 4;
+ }
+ break;
+ }
default:
break;
}
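A worked reading of the constant returned above for experimental.cttz.elts (the arithmetic is inferred from the comment, not spelled out in the patch): if a vector add is costed at reciprocal throughput 1, an instruction with the same latency but half the throughput costs 2, so

    cost(brkb) + cost(cntp) = 2 + 2 = 4

which is the value returned whenever the target does not expand the intrinsic.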
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 0ac131e..4f6a413 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -36,27 +36,26 @@ let WantsRoot = true in
def am_sme_indexed_b4 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0, 15>">;
// The FORM_TRANSPOSED_REG_TUPLE pseudos defined below are intended to
-// improve register allocation for intrinsics which use strided and contiguous
-// multi-vector registers, avoiding unnecessary copies.
-// If the operands of the pseudo are copies where the source register is in
-// the StridedOrContiguous class, the pseudo is used to provide a hint to the
-// register allocator suggesting a contigious multi-vector register which
-// matches the subregister sequence used by the operands.
-// If the operands do not match this pattern, the pseudos are expanded
-// to a REG_SEQUENCE using the post-isel hook.
+// improve register allocation for intrinsics which use strided and
+// contiguous multi-vector registers, avoiding unnecessary copies.
+// The SMEPeepholeOpt pass will replace a REG_SEQUENCE instruction with the
+// FORM_TRANSPOSED_REG_TUPLE pseudo if the operands are copies where the
+// source register is in the StridedOrContiguous class. The operands in the
+// sequence must all have the same subreg index.
+// The pseudo is then used to provide a hint to the register allocator
+// suggesting a contiguous multi-vector register which matches the
+// subregister sequence used by the operands.
def FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO :
Pseudo<(outs ZPR2:$tup),
(ins ZPR:$zn0, ZPR:$zn1), []>, Sched<[]>{
let hasSideEffects = 0;
- let hasPostISelHook = 1;
}
def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO :
Pseudo<(outs ZPR4:$tup),
(ins ZPR:$zn0, ZPR:$zn1, ZPR:$zn2, ZPR:$zn3), []>, Sched<[]>{
let hasSideEffects = 0;
- let hasPostISelHook = 1;
}
def SPILL_PPR_TO_ZPR_SLOT_PSEUDO :
@@ -178,14 +177,14 @@ class SME2_ZA_TwoOp_Multi_Single_Pat<string name, SDPatternOperator intrinsic, O
class SME2_ZA_TwoOp_VG2_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty,
ValueType vt, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm),
- (!cast<Instruction>(name # _PSEUDO) $base, $offset, (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1, vt:$Zn2),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset, (REG_SEQUENCE ZPR2, vt:$Zn1, zsub0, vt:$Zn2, zsub1),
zpr_ty:$Zm)>;
class SME2_ZA_TwoOp_VG4_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty,
ValueType vt, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
- (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
+ (REG_SEQUENCE ZPR4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
zpr_ty:$Zm)>;
class SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ValueType vt, ComplexPattern tileslice>
@@ -211,14 +210,14 @@ class SME2_ZA_TwoOp_VG2_Multi_Index_Pat<string name, SDPatternOperator intrinsic
Operand imm_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
- (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1,vt:$Zn2), zpr_ty:$Zm, imm_ty:$i)>;
+ (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm, imm_ty:$i)>;
class SME2_ZA_TwoOp_VG4_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
Operand imm_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
- (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
+ (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
zpr_ty:$Zm, imm_ty:$i)>;
class SME2_Sat_Shift_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, ValueType in_vt, Operand imm_ty>
diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
index 4a0312d..2ffd4d7 100644
--- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
@@ -45,6 +45,7 @@ struct SMEPeepholeOpt : public MachineFunctionPass {
bool optimizeStartStopPairs(MachineBasicBlock &MBB,
bool &HasRemovedAllSMChanges) const;
+ bool visitRegSequence(MachineInstr &MI);
};
char SMEPeepholeOpt::ID = 0;
@@ -225,6 +226,81 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
return Changed;
}
+// Using the FORM_TRANSPOSED_REG_TUPLE pseudo can improve register allocation
+// of multi-vector intrinsics. However, the pseudo should only be emitted if
+// the input registers of the REG_SEQUENCE are copy nodes where the source
+// register is in a StridedOrContiguous class. For example:
+//
+// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
+// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
+// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
+// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
+// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
+// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
+// %9:zpr2mul2 = REG_SEQUENCE %5:zpr, %subreg.zsub0, %8:zpr, %subreg.zsub1
+//
+// -> %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
+//
+bool SMEPeepholeOpt::visitRegSequence(MachineInstr &MI) {
+ assert(MI.getMF()->getRegInfo().isSSA() && "Expected to be run on SSA form!");
+
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ switch (MRI.getRegClass(MI.getOperand(0).getReg())->getID()) {
+ case AArch64::ZPR2RegClassID:
+ case AArch64::ZPR4RegClassID:
+ case AArch64::ZPR2Mul2RegClassID:
+ case AArch64::ZPR4Mul4RegClassID:
+ break;
+ default:
+ return false;
+ }
+
+ // The first operand is the register class created by the REG_SEQUENCE.
+ // Each operand pair after this consists of a vreg + subreg index, so
+ // for example a sequence of 2 registers will have a total of 5 operands.
+ if (MI.getNumOperands() != 5 && MI.getNumOperands() != 9)
+ return false;
+
+ MCRegister SubReg = MCRegister::NoRegister;
+ for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
+ MachineOperand &MO = MI.getOperand(I);
+
+ MachineOperand *Def = MRI.getOneDef(MO.getReg());
+ if (!Def || !Def->getParent()->isCopy())
+ return false;
+
+ const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
+ unsigned OpSubReg = CopySrc.getSubReg();
+ if (SubReg == MCRegister::NoRegister)
+ SubReg = OpSubReg;
+
+ MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
+ if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
+ CopySrcOp->getReg().isPhysical())
+ return false;
+
+ const TargetRegisterClass *CopySrcClass =
+ MRI.getRegClass(CopySrcOp->getReg());
+ if (CopySrcClass != &AArch64::ZPR2StridedOrContiguousRegClass &&
+ CopySrcClass != &AArch64::ZPR4StridedOrContiguousRegClass)
+ return false;
+ }
+
+ unsigned Opc = MI.getNumOperands() == 5
+ ? AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO
+ : AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO;
+
+ const TargetInstrInfo *TII =
+ MI.getMF()->getSubtarget<AArch64Subtarget>().getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(Opc), MI.getOperand(0).getReg());
+ for (unsigned I = 1; I < MI.getNumOperands(); I += 2)
+ MIB.addReg(MI.getOperand(I).getReg());
+
+ MI.eraseFromParent();
+ return true;
+}
+
INITIALIZE_PASS(SMEPeepholeOpt, "aarch64-sme-peephole-opt",
"SME Peephole Optimization", false, false)
@@ -247,6 +323,12 @@ bool SMEPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
bool BlockHasAllSMChangesRemoved;
Changed |= optimizeStartStopPairs(MBB, BlockHasAllSMChangesRemoved);
FunctionHasAllSMChangesRemoved |= BlockHasAllSMChangesRemoved;
+
+ if (MF.getSubtarget<AArch64Subtarget>().isStreaming()) {
+ for (MachineInstr &MI : make_early_inc_range(MBB))
+ if (MI.getOpcode() == AArch64::REG_SEQUENCE)
+ Changed |= visitRegSequence(MI);
+ }
}
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index cca9fa7..792e17e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4217,18 +4217,21 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
// trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
- if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
- SDValue BV = stripBitcast(Src.getOperand(0));
- if (BV.getOpcode() == ISD::BUILD_VECTOR &&
- BV.getValueType().getVectorNumElements() == 2) {
- SDValue SrcElt = BV.getOperand(1);
- EVT SrcEltVT = SrcElt.getValueType();
- if (SrcEltVT.isFloatingPoint()) {
- SrcElt = DAG.getNode(ISD::BITCAST, SL,
- SrcEltVT.changeTypeToInteger(), SrcElt);
+ SDValue BV = stripBitcast(Src.getOperand(0));
+ if (BV.getOpcode() == ISD::BUILD_VECTOR) {
+ EVT SrcEltVT = BV.getOperand(0).getValueType();
+ unsigned SrcEltSize = SrcEltVT.getSizeInBits();
+ unsigned BitIndex = K->getZExtValue();
+ unsigned PartIndex = BitIndex / SrcEltSize;
+
+ if (PartIndex * SrcEltSize == BitIndex &&
+ PartIndex < BV.getNumOperands()) {
+ if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
+ SDValue SrcElt =
+ DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
+ BV.getOperand(PartIndex));
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
}
-
- return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
}
}
}
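The rewritten truncate combine above generalizes the old two-element case (only element 1 of a two-element build_vector) to any element sitting at a whole-element bit offset. A sketch in the same notation as the comment in the code, with illustrative types that are not taken from the patch:

    ; BitIndex = 32, SrcEltSize = 16, PartIndex = 2
    trunc (srl (bitcast (build_vector x, y, z, w):v4i16 to i64), 32) to i16
      -> trunc (bitcast z)

All three guards hold: 2 * 16 == 32, PartIndex 2 < 4 operands, and the 16-bit element width matches the truncated result width.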
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 5bfd891..09f7877 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -416,8 +416,6 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
return 1024;
}
-// FIXME: Should we use narrower types for local/region, or account for when
-// unaligned access is legal?
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
@@ -426,29 +424,12 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(
if (AtomicElementSize)
return Type::getIntNTy(Context, *AtomicElementSize * 8);
- Align MinAlign = std::min(SrcAlign, DestAlign);
-
- // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
- // hardware into byte accesses. If you assume all alignments are equally
- // probable, it's more efficient on average to use short accesses for this
- // case.
- if (MinAlign == Align(2))
- return Type::getInt16Ty(Context);
-
- // Not all subtargets have 128-bit DS instructions, and we currently don't
- // form them by default.
- if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
- DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
- return FixedVectorType::get(Type::getInt32Ty(Context), 2);
- }
-
- // Global memory works best with 16-byte accesses.
+ // 16-byte accesses achieve the highest copy throughput.
// If the operation has a fixed known length that is large enough, it is
// worthwhile to return an even wider type and let legalization lower it into
- // multiple accesses, effectively unrolling the memcpy loop. Private memory
- // also hits this, although accesses may be decomposed.
+ // multiple accesses, effectively unrolling the memcpy loop.
+ // We also rely on legalization to decompose into smaller accesses for
+ // subtargets and address spaces where it is necessary.
//
// Don't unroll if Length is not a constant, since unrolling leads to worse
// performance for length values that are smaller or slightly larger than the
@@ -473,26 +454,22 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
DestAlign, AtomicCpySize);
- Align MinAlign = std::min(SrcAlign, DestAlign);
-
- if (MinAlign != Align(2)) {
- Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
- while (RemainingBytes >= 16) {
- OpsOut.push_back(I32x4Ty);
- RemainingBytes -= 16;
- }
+ Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
+ while (RemainingBytes >= 16) {
+ OpsOut.push_back(I32x4Ty);
+ RemainingBytes -= 16;
+ }
- Type *I64Ty = Type::getInt64Ty(Context);
- while (RemainingBytes >= 8) {
- OpsOut.push_back(I64Ty);
- RemainingBytes -= 8;
- }
+ Type *I64Ty = Type::getInt64Ty(Context);
+ while (RemainingBytes >= 8) {
+ OpsOut.push_back(I64Ty);
+ RemainingBytes -= 8;
+ }
- Type *I32Ty = Type::getInt32Ty(Context);
- while (RemainingBytes >= 4) {
- OpsOut.push_back(I32Ty);
- RemainingBytes -= 4;
- }
+ Type *I32Ty = Type::getInt32Ty(Context);
+ while (RemainingBytes >= 4) {
+ OpsOut.push_back(I32Ty);
+ RemainingBytes -= 4;
}
Type *I16Ty = Type::getInt16Ty(Context);
diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index a20319e..ac11526 100644
--- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -287,10 +287,10 @@ bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
RegSeqInfo &CompatibleRSI,
std::vector<std::pair<unsigned, unsigned>> &RemapChan) {
unsigned NeededUndefs = 4 - RSI.UndefReg.size();
- if (PreviousRegSeqByUndefCount[NeededUndefs].empty())
- return false;
std::vector<MachineInstr *> &MIs =
PreviousRegSeqByUndefCount[NeededUndefs];
+ if (MIs.empty())
+ return false;
CompatibleRSI = PreviousRegSeq[MIs.back()];
tryMergeVector(&CompatibleRSI, &RSI, RemapChan);
return true;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bee4c47..6e08aff 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2703,15 +2703,20 @@ class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, S
(i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE))
>;
-let OtherPredicates = [NotHasTrue16BitInsts] in {
+let True16Predicate = NotHasTrue16BitInsts in {
def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
-} // end OtherPredicates = [NotHasTrue16BitInsts]
+} // end True16Predicate = NotHasTrue16BitInsts
+
+let True16Predicate = UseRealTrue16Insts in {
+ def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
+ def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
+} // end True16Predicate = UseRealTrue16Insts
-let OtherPredicates = [HasTrue16BitInsts] in {
+let True16Predicate = UseFakeTrue16Insts in {
def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
-} // end OtherPredicates = [HasTrue16BitInsts]
+} // end True16Predicate = UseFakeTrue16Insts
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
@@ -3790,6 +3795,13 @@ def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>
def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
}
+let True16Predicate = UseRealTrue16Insts in {
+def : FPMinMaxPat<V_MINMAX_F16_t16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinMaxPat<V_MAXMIN_F16_t16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MINMAX_F16_t16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MAXMIN_F16_t16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+}
+
let True16Predicate = UseFakeTrue16Insts in {
def : FPMinMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
def : FPMinMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
@@ -3819,6 +3831,13 @@ def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>
def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
}
+let True16Predicate = UseRealTrue16Insts, SubtargetPredicate = isGFX12Plus in {
+def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_t16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_t16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F16_t16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F16_t16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+}
+
let True16Predicate = UseFakeTrue16Insts, SubtargetPredicate = isGFX12Plus in {
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_fake16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_fake16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index fad7e67..67bebfb3 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -976,8 +976,7 @@ struct Waitcnt {
Waitcnt() = default;
// Pre-gfx12 constructor.
Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt)
- : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt),
- SampleCnt(~0u), BvhCnt(~0u), KmCnt(~0u) {}
+ : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt) {}
// gfx12+ constructor.
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
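The constructor simplification above is only safe if SampleCnt, BvhCnt and KmCnt already carry in-class ~0u defaults, which the change appears to rely on. Under that assumption it is the usual default-member-initializer cleanup, shown here in miniature:

    struct Counts {
      unsigned LoadCnt = ~0u;
      unsigned SampleCnt = ~0u; // keeps its ~0u default without being named below
      Counts(unsigned Load) : LoadCnt(Load) {}
    };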
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index 1e76bf7..296031e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -27,6 +27,28 @@
using namespace llvm;
using namespace llvm::AMDGPU;
+// Return the PAL metadata hardware shader stage name.
+static const char *getStageName(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::AMDGPU_PS:
+ return ".ps";
+ case CallingConv::AMDGPU_VS:
+ return ".vs";
+ case CallingConv::AMDGPU_GS:
+ return ".gs";
+ case CallingConv::AMDGPU_ES:
+ return ".es";
+ case CallingConv::AMDGPU_HS:
+ return ".hs";
+ case CallingConv::AMDGPU_LS:
+ return ".ls";
+ case CallingConv::AMDGPU_Gfx:
+ llvm_unreachable("Callable shader has no hardware stage");
+ default:
+ return ".cs";
+ }
+}
+
// Read the PAL metadata from IR metadata, where it was put by the frontend.
void AMDGPUPALMetadata::readFromIR(Module &M) {
auto *NamedMD = M.getNamedMetadata("amdgpu.pal.metadata.msgpack");
@@ -232,8 +254,18 @@ void AMDGPUPALMetadata::setEntryPoint(unsigned CC, StringRef Name) {
if (isLegacy())
return;
// Msgpack format.
+ // The entry point is recorded under .entry_point_symbol and set to the
+ // function name.
getHwStage(CC)[".entry_point_symbol"] =
MsgPackDoc.getNode(Name, /*Copy=*/true);
+
+ // Also set .entry_point, which is defined to be _amdgpu_<stage>
+ // (_amdgpu_cs for non-shader functions).
+ SmallString<16> EPName("_amdgpu_");
+ raw_svector_ostream EPNameOS(EPName);
+ EPNameOS << getStageName(CC) + 1;
+ getHwStage(CC)[".entry_point"] =
+ MsgPackDoc.getNode(EPNameOS.str(), /*Copy=*/true);
}
// Set the number of used vgprs in the metadata. This is an optional
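Concretely, setEntryPoint above now records both keys under the hardware stage; for a pixel shader with the hypothetical name foo the result is

    .entry_point:        _amdgpu_ps
    .entry_point_symbol: foo

with _amdgpu_cs substituted for anything that reaches getStageName's default case.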
@@ -943,28 +975,6 @@ msgpack::MapDocNode AMDGPUPALMetadata::getGraphicsRegisters() {
return GraphicsRegisters.getMap();
}
-// Return the PAL metadata hardware shader stage name.
-static const char *getStageName(CallingConv::ID CC) {
- switch (CC) {
- case CallingConv::AMDGPU_PS:
- return ".ps";
- case CallingConv::AMDGPU_VS:
- return ".vs";
- case CallingConv::AMDGPU_GS:
- return ".gs";
- case CallingConv::AMDGPU_ES:
- return ".es";
- case CallingConv::AMDGPU_HS:
- return ".hs";
- case CallingConv::AMDGPU_LS:
- return ".ls";
- case CallingConv::AMDGPU_Gfx:
- llvm_unreachable("Callable shader has no hardware stage");
- default:
- return ".cs";
- }
-}
-
msgpack::DocNode &AMDGPUPALMetadata::refHwStage() {
auto &N =
MsgPackDoc.getRoot()
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 633a99d..74def43 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -163,6 +163,7 @@ def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
+def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">;
def True : Predicate<"true">;
def False : Predicate<"false">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 56d8b73..a0d00e4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -7582,3 +7582,44 @@ def GRIDDEPCONTROL_WAIT :
Requires<[hasSM<90>, hasPTX<78>]>;
def INT_EXIT : NVPTXInst<(outs), (ins), "exit;", [(int_nvvm_exit)]>;
+
+// Tcgen05 intrinsics
+let isConvergent = true in {
+
+multiclass TCGEN05_ALLOC_INTR<NVPTXRegClass rc, string AS, string num, Intrinsic Intr> {
+ def NAME : NVPTXInst<(outs),
+ (ins rc:$dst, Int32Regs:$ncols),
+ !strconcat("tcgen05.alloc.cta_group::", num, ".sync.aligned", AS, ".b32 [$dst], $ncols;"),
+ [(Intr rc:$dst, Int32Regs:$ncols)]>,
+ Requires<[hasTcgen05Instructions]>;
+}
+
+defm TCGEN05_ALLOC_CG1 : TCGEN05_ALLOC_INTR<Int64Regs, "", "1", int_nvvm_tcgen05_alloc_cg1>;
+defm TCGEN05_ALLOC_CG2 : TCGEN05_ALLOC_INTR<Int64Regs, "", "2", int_nvvm_tcgen05_alloc_cg2>;
+
+defm TCGEN05_ALLOC_S64_CG1 : TCGEN05_ALLOC_INTR<Int64Regs, ".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>;
+defm TCGEN05_ALLOC_S64_CG2 : TCGEN05_ALLOC_INTR<Int64Regs, ".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>;
+
+defm TCGEN05_ALLOC_S32_CG1 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>;
+defm TCGEN05_ALLOC_S32_CG2 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>;
+
+multiclass TCGEN05_DEALLOC_INTR<string num, Intrinsic Intr> {
+ def NAME : NVPTXInst<(outs),
+ (ins Int32Regs:$tmem_addr, Int32Regs:$ncols),
+ !strconcat("tcgen05.dealloc.cta_group::", num, ".sync.aligned.b32 $tmem_addr, $ncols;"),
+ [(Intr Int32Regs:$tmem_addr, Int32Regs:$ncols)]>,
+ Requires<[hasTcgen05Instructions]>;
+}
+defm TCGEN05_DEALLOC_CG1: TCGEN05_DEALLOC_INTR<"1", int_nvvm_tcgen05_dealloc_cg1>;
+defm TCGEN05_DEALLOC_CG2: TCGEN05_DEALLOC_INTR<"2", int_nvvm_tcgen05_dealloc_cg2>;
+
+multiclass TCGEN05_RELINQ_PERMIT_INTR<string num, Intrinsic Intr> {
+ def NAME : NVPTXInst<(outs), (ins),
+ !strconcat("tcgen05.relinquish_alloc_permit.cta_group::", num, ".sync.aligned;"),
+ [(Intr)]>,
+ Requires<[hasTcgen05Instructions]>;
+}
+defm TCGEN05_RELINQ_CG1: TCGEN05_RELINQ_PERMIT_INTR<"1", int_nvvm_tcgen05_relinq_alloc_permit_cg1>;
+defm TCGEN05_RELINQ_CG2: TCGEN05_RELINQ_PERMIT_INTR<"2", int_nvvm_tcgen05_relinq_alloc_permit_cg2>;
+
+} // isConvergent
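Expanding the !strconcat strings above gives the PTX these definitions emit; for the cta_group::1 variants (a purely mechanical derivation, listed for readability; the ::2 forms differ only in the group number):

    tcgen05.alloc.cta_group::1.sync.aligned.b32 [$dst], $ncols;
    tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [$dst], $ncols;
    tcgen05.dealloc.cta_group::1.sync.aligned.b32 $tmem_addr, $ncols;
    tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned;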
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 919f487..0c4420b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -93,6 +93,21 @@ public:
bool hasDotInstructions() const {
return SmVersion >= 61 && PTXVersion >= 50;
}
+ // Tcgen05 instructions in Blackwell family
+ bool hasTcgen05Instructions() const {
+ bool HasTcgen05 = false;
+ switch (FullSmVersion) {
+ default:
+ break;
+ case 1001: // sm_100a
+ case 1011: // sm_101a
+ HasTcgen05 = true;
+ break;
+ }
+
+ return HasTcgen05 && PTXVersion >= 86;
+ }
+
// Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
// terminates a basic block. Instead, it would assume that control flow
// continued to the next instruction. The next instruction could be in the
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index e88027f..f2afa6f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -140,6 +140,9 @@ static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
else if (UseShortPointers)
Ret += "-p3:32:32-p4:32:32-p5:32:32";
+ // Tensor Memory (addrspace:6) is always 32-bit.
+ Ret += "-p6:32:32";
+
Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
return Ret;
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
index 36b8a24..274c7cb 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -65,9 +65,10 @@ public:
/// for a given imm form load/store opcode \p ImmFormOpcode.
/// FIXME: move this to PPCInstrInfo class.
unsigned getMappedIdxOpcForImmOpc(unsigned ImmOpcode) const {
- if (!ImmToIdxMap.count(ImmOpcode))
+ auto It = ImmToIdxMap.find(ImmOpcode);
+ if (It == ImmToIdxMap.end())
return PPC::INSTRUCTION_LIST_END;
- return ImmToIdxMap.find(ImmOpcode)->second;
+ return It->second;
}
/// getPointerRegClass - Return the register class to use to hold pointers.
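The change above is the standard single-lookup idiom: one find() replaces the earlier count()-then-find() pair. A minimal standalone sketch, with a placeholder map and sentinel rather than the PPC declarations:

    #include "llvm/ADT/DenseMap.h"

    static unsigned lookupOrSentinel(const llvm::DenseMap<unsigned, unsigned> &Map,
                                     unsigned Key, unsigned Sentinel) {
      auto It = Map.find(Key); // single hash lookup
      return It == Map.end() ? Sentinel : It->second;
    }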
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td
index ad06f47..98e05b7 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.td
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td
@@ -42,6 +42,8 @@ def CSR_ILP32D_LP64D_V
// Needed for implementation of RISCVRegisterInfo::getNoPreservedMask()
def CSR_NoRegs : CalleeSavedRegs<(add)>;
+def CSR_IPRA : CalleeSavedRegs<(add X1)>;
+
// Interrupt handler needs to save/restore all registers that are used,
// both Caller and Callee saved registers.
def CSR_Interrupt : CalleeSavedRegs<(add X1, (sequence "X%u", 5, 31))>;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 8e3caf5..7c3b583 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -17759,6 +17759,83 @@ static SDValue combineScalarCTPOPToVCPOP(SDNode *N, SelectionDAG &DAG,
return DAG.getZExtOrTrunc(Pop, DL, VT);
}
+static SDValue performSHLCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const RISCVSubtarget &Subtarget) {
+ // (shl (zext x), y) -> (vwsll x, y)
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
+ return V;
+
+ // (shl (sext x), C) -> (vwmulsu x, 1u << C)
+ // (shl (zext x), C) -> (vwmulu x, 1u << C)
+
+ if (!DCI.isAfterLegalizeDAG())
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+ if (!LHS.hasOneUse())
+ return SDValue();
+ unsigned Opcode;
+ switch (LHS.getOpcode()) {
+ case ISD::SIGN_EXTEND:
+ case RISCVISD::VSEXT_VL:
+ Opcode = RISCVISD::VWMULSU_VL;
+ break;
+ case ISD::ZERO_EXTEND:
+ case RISCVISD::VZEXT_VL:
+ Opcode = RISCVISD::VWMULU_VL;
+ break;
+ default:
+ return SDValue();
+ }
+
+ SDValue RHS = N->getOperand(1);
+ APInt ShAmt;
+ uint64_t ShAmtInt;
+ if (ISD::isConstantSplatVector(RHS.getNode(), ShAmt))
+ ShAmtInt = ShAmt.getZExtValue();
+ else if (RHS.getOpcode() == RISCVISD::VMV_V_X_VL &&
+ RHS.getOperand(1).getOpcode() == ISD::Constant)
+ ShAmtInt = RHS.getConstantOperandVal(1);
+ else
+ return SDValue();
+
+ // Better foldings:
+ // (shl (sext x), 1) -> (vwadd x, x)
+ // (shl (zext x), 1) -> (vwaddu x, x)
+ if (ShAmtInt <= 1)
+ return SDValue();
+
+ SDValue NarrowOp = LHS.getOperand(0);
+ MVT NarrowVT = NarrowOp.getSimpleValueType();
+ uint64_t NarrowBits = NarrowVT.getScalarSizeInBits();
+ if (ShAmtInt >= NarrowBits)
+ return SDValue();
+ MVT VT = N->getSimpleValueType(0);
+ if (NarrowBits * 2 != VT.getScalarSizeInBits())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+ SDValue Passthru, Mask, VL;
+ switch (N->getOpcode()) {
+ case ISD::SHL:
+ Passthru = DAG.getUNDEF(VT);
+ std::tie(Mask, VL) = getDefaultScalableVLOps(VT, DL, DAG, Subtarget);
+ break;
+ case RISCVISD::SHL_VL:
+ Passthru = N->getOperand(2);
+ Mask = N->getOperand(3);
+ VL = N->getOperand(4);
+ break;
+ default:
+ llvm_unreachable("Expected SHL");
+ }
+ return DAG.getNode(Opcode, DL, VT, NarrowOp,
+ DAG.getConstant(1ULL << ShAmtInt, SDLoc(RHS), NarrowVT),
+ Passthru, Mask, VL);
+}
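A concrete instance of the new fold, in the same notation as the comments above and with illustrative types not taken from the patch: shifting a zero-extended nxv4i16 left by 3 in the wide nxv4i32 type becomes a widening multiply on the narrow operand,

    (shl (zext nxv4i16:x to nxv4i32), splat 3) -> (vwmulu x, splat (1 << 3))

which passes every guard in the function: the shift amount is greater than 1, smaller than the 16-bit narrow width, and the wide element (32 bits) is exactly twice the narrow one.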
+
SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -18392,7 +18469,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case RISCVISD::SHL_VL:
- if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
+ if (SDValue V = performSHLCombine(N, DCI, Subtarget))
return V;
[[fallthrough]];
case RISCVISD::SRA_VL:
@@ -18417,7 +18494,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SRL:
case ISD::SHL: {
if (N->getOpcode() == ISD::SHL) {
- if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
+ if (SDValue V = performSHLCombine(N, DCI, Subtarget))
return V;
}
SDValue ShAmt = N->getOperand(1);
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index b0a5269..7a99bfd 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -56,6 +56,11 @@ RISCVRegisterInfo::RISCVRegisterInfo(unsigned HwMode)
/*PC*/0, HwMode) {}
const MCPhysReg *
+RISCVRegisterInfo::getIPRACSRegs(const MachineFunction *MF) const {
+ return CSR_IPRA_SaveList;
+}
+
+const MCPhysReg *
RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
auto &Subtarget = MF->getSubtarget<RISCVSubtarget>();
if (MF->getFunction().getCallingConv() == CallingConv::GHC)
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 3ab79694..6c4e9c7 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -62,6 +62,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const MCPhysReg *getIPRACSRegs(const MachineFunction *MF) const override;
+
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isAsmClobberable(const MachineFunction &MF,
MCRegister PhysReg) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index fa7c7c5..cb2ec1d 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -940,6 +940,44 @@ InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
return NumLoads * MemOpCost;
}
+InstructionCost RISCVTTIImpl::getExpandCompressMemoryOpCost(
+ unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
+ TTI::TargetCostKind CostKind, const Instruction *I) {
+ bool IsLegal = (Opcode == Instruction::Store &&
+ isLegalMaskedCompressStore(DataTy, Alignment)) ||
+ (Opcode == Instruction::Load &&
+ isLegalMaskedExpandLoad(DataTy, Alignment));
+ if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask,
+ Alignment, CostKind, I);
+ // Example compressstore sequence:
+ // vsetivli zero, 8, e32, m2, ta, ma (ignored)
+ // vcompress.vm v10, v8, v0
+ // vcpop.m a1, v0
+ // vsetvli zero, a1, e32, m2, ta, ma
+ // vse32.v v10, (a0)
+ // Example expandload sequence:
+ // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
+ // vcpop.m a1, v0
+ // vsetvli zero, a1, e32, m2, ta, ma
+ // vle32.v v10, (a0)
+ // vsetivli zero, 8, e32, m2, ta, ma
+ // viota.m v12, v0
+ // vrgather.vv v8, v10, v12, v0.t
+ auto MemOpCost =
+ getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
+ auto LT = getTypeLegalizationCost(DataTy);
+ SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
+ if (VariableMask)
+ Opcodes.push_back(RISCV::VCPOP_M);
+ if (Opcode == Instruction::Store)
+ Opcodes.append({RISCV::VCOMPRESS_VM});
+ else
+ Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
+ return MemOpCost +
+ LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
+}
+
InstructionCost RISCVTTIImpl::getStridedMemoryOpCost(
unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 042530b..5389e9b 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -174,6 +174,12 @@ public:
TTI::TargetCostKind CostKind,
const Instruction *I);
+ InstructionCost getExpandCompressMemoryOpCost(unsigned Opcode, Type *Src,
+ bool VariableMask,
+ Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy,
const Value *Ptr, bool VariableMask,
Align Alignment,
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 78db841..c202f7f 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -284,7 +284,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
// Adjust stack pointer.
int StackAdj = StackAdjust.getImm();
int MaxTCDelta = X86FI->getTCReturnAddrDelta();
- int Offset = 0;
+ int64_t Offset = 0;
assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
// Incorporate the retaddr area.
@@ -297,7 +297,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
if (Offset) {
// Check for possible merge with preceding ADD instruction.
- Offset += X86FL->mergeSPUpdates(MBB, MBBI, true);
+ Offset = X86FL->mergeSPAdd(MBB, MBBI, Offset, true);
X86FL->emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue=*/true);
}
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index a15db03..50c56c9 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -223,6 +223,8 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
return false;
}
+constexpr int64_t MaxSPChunk = (1LL << 31) - 1;
+
/// emitSPUpdate - Emit a series of instructions to increment / decrement the
/// stack pointer by a constant value.
void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
@@ -242,7 +244,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
return;
}
- uint64_t Chunk = (1LL << 31) - 1;
+ uint64_t Chunk = MaxSPChunk;
MachineFunction &MF = *MBB.getParent();
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
@@ -391,12 +393,15 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
return MI;
}
-int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- bool doMergeWithPrevious) const {
+template <typename FoundT, typename CalcT>
+int64_t X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ FoundT FoundStackAdjust,
+ CalcT CalcNewOffset,
+ bool doMergeWithPrevious) const {
if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
(!doMergeWithPrevious && MBBI == MBB.end()))
- return 0;
+ return CalcNewOffset(0);
MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
@@ -415,27 +420,38 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
if (doMergeWithPrevious && PI != MBB.begin() && PI->isCFIInstruction())
PI = std::prev(PI);
- unsigned Opc = PI->getOpcode();
- int Offset = 0;
-
- if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) &&
- PI->getOperand(0).getReg() == StackPtr) {
- assert(PI->getOperand(1).getReg() == StackPtr);
- Offset = PI->getOperand(2).getImm();
- } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
- PI->getOperand(0).getReg() == StackPtr &&
- PI->getOperand(1).getReg() == StackPtr &&
- PI->getOperand(2).getImm() == 1 &&
- PI->getOperand(3).getReg() == X86::NoRegister &&
- PI->getOperand(5).getReg() == X86::NoRegister) {
- // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
- Offset = PI->getOperand(4).getImm();
- } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) &&
- PI->getOperand(0).getReg() == StackPtr) {
- assert(PI->getOperand(1).getReg() == StackPtr);
- Offset = -PI->getOperand(2).getImm();
- } else
- return 0;
+ int64_t Offset = 0;
+ for (;;) {
+ unsigned Opc = PI->getOpcode();
+
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ assert(PI->getOperand(1).getReg() == StackPtr);
+ Offset = PI->getOperand(2).getImm();
+ } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
+ PI->getOperand(0).getReg() == StackPtr &&
+ PI->getOperand(1).getReg() == StackPtr &&
+ PI->getOperand(2).getImm() == 1 &&
+ PI->getOperand(3).getReg() == X86::NoRegister &&
+ PI->getOperand(5).getReg() == X86::NoRegister) {
+ // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
+ Offset = PI->getOperand(4).getImm();
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ assert(PI->getOperand(1).getReg() == StackPtr);
+ Offset = -PI->getOperand(2).getImm();
+ } else
+ return CalcNewOffset(0);
+
+ FoundStackAdjust(PI, Offset);
+ if (std::abs((int64_t)CalcNewOffset(Offset)) < MaxSPChunk)
+ break;
+
+ if (doMergeWithPrevious ? (PI == MBB.begin()) : (PI == MBB.end()))
+ return CalcNewOffset(0);
+
+ PI = doMergeWithPrevious ? std::prev(PI) : std::next(PI);
+ }
PI = MBB.erase(PI);
if (PI != MBB.end() && PI->isCFIInstruction()) {
@@ -448,7 +464,16 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
if (!doMergeWithPrevious)
MBBI = skipDebugInstructionsForward(PI, MBB.end());
- return Offset;
+ return CalcNewOffset(Offset);
+}
+
+int64_t X86FrameLowering::mergeSPAdd(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ int64_t AddOffset,
+ bool doMergeWithPrevious) const {
+ return mergeSPUpdates(
+ MBB, MBBI, [AddOffset](int64_t Offset) { return AddOffset + Offset; },
+ doMergeWithPrevious);
}
void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
@@ -1975,8 +2000,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// If there is an SUB32ri of ESP immediately before this instruction, merge
// the two. This can be the case when tail call elimination is enabled and
- // the callee has more arguments then the caller.
- NumBytes -= mergeSPUpdates(MBB, MBBI, true);
+ // the callee has more arguments than the caller.
+ NumBytes = mergeSPUpdates(
+ MBB, MBBI, [NumBytes](int64_t Offset) { return NumBytes - Offset; },
+ true);
// Adjust stack pointer: ESP -= numbytes.
@@ -2457,7 +2484,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (HasFP) {
if (X86FI->hasSwiftAsyncContext()) {
// Discard the context.
- int Offset = 16 + mergeSPUpdates(MBB, MBBI, true);
+ int64_t Offset = mergeSPAdd(MBB, MBBI, 16, true);
emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true);
}
// Pop EBP.
@@ -2531,7 +2558,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// If there is an ADD32ri or SUB32ri of ESP immediately before this
// instruction, merge the two instructions.
if (NumBytes || MFI.hasVarSizedObjects())
- NumBytes += mergeSPUpdates(MBB, MBBI, true);
+ NumBytes = mergeSPAdd(MBB, MBBI, NumBytes, true);
// If dynamic alloca is used, then reset esp to point to the last callee-saved
// slot before popping them off! Same applies for the case, when stack was
@@ -2612,11 +2639,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
// Add the return addr area delta back since we are not tail calling.
- int Offset = -1 * X86FI->getTCReturnAddrDelta();
+ int64_t Offset = -1 * X86FI->getTCReturnAddrDelta();
assert(Offset >= 0 && "TCDelta should never be positive");
if (Offset) {
// Check for possible merge with preceding ADD instruction.
- Offset += mergeSPUpdates(MBB, Terminator, true);
+ Offset = mergeSPAdd(MBB, Terminator, Offset, true);
emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);
}
}
@@ -3814,13 +3841,24 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(
// Add Amount to SP to destroy a frame, or subtract to setup.
int64_t StackAdjustment = isDestroy ? Amount : -Amount;
+ int64_t CfaAdjustment = StackAdjustment;
if (StackAdjustment) {
// Merge with any previous or following adjustment instruction. Note: the
// instructions merged with here do not have CFI, so their stack
- // adjustments do not feed into CfaAdjustment.
- StackAdjustment += mergeSPUpdates(MBB, InsertPos, true);
- StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);
+ // adjustments do not feed into CfaAdjustment.
+
+ auto CalcCfaAdjust = [&CfaAdjustment](MachineBasicBlock::iterator PI,
+ int64_t Offset) {
+ CfaAdjustment += Offset;
+ };
+ auto CalcNewOffset = [&StackAdjustment](int64_t Offset) {
+ return StackAdjustment + Offset;
+ };
+ StackAdjustment =
+ mergeSPUpdates(MBB, InsertPos, CalcCfaAdjust, CalcNewOffset, true);
+ StackAdjustment =
+ mergeSPUpdates(MBB, InsertPos, CalcCfaAdjust, CalcNewOffset, false);
if (StackAdjustment) {
if (!(F.hasMinSize() &&
@@ -3830,7 +3868,7 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(
}
}
- if (DwarfCFI && !hasFP(MF)) {
+ if (DwarfCFI && !hasFP(MF) && CfaAdjustment) {
// If we don't have FP, but need to generate unwind information,
// we need to set the correct CFA offset after the stack adjustment.
// How much we adjust the CFA offset depends on whether we're emitting
@@ -3838,14 +3876,11 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(
// offset to be correct at each call site, while for debugging we want
// it to be more precise.
- int64_t CfaAdjustment = -StackAdjustment;
// TODO: When not using precise CFA, we also need to adjust for the
// InternalAmt here.
- if (CfaAdjustment) {
- BuildCFI(
- MBB, InsertPos, DL,
- MCCFIInstruction::createAdjustCfaOffset(nullptr, CfaAdjustment));
- }
+ BuildCFI(
+ MBB, InsertPos, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, -CfaAdjustment));
}
return I;
diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h
index 02fe8ee..ef41b46 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/llvm/lib/Target/X86/X86FrameLowering.h
@@ -134,12 +134,50 @@ public:
processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
RegScavenger *RS) const override;
- /// Check the instruction before/after the passed instruction. If
- /// it is an ADD/SUB/LEA instruction it is deleted argument and the
- /// stack adjustment is returned as a positive value for ADD/LEA and
- /// a negative for SUB.
- int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- bool doMergeWithPrevious) const;
+private:
+ /// Basic Pseudocode:
+ /// if (instruction before/after the passed instruction is ADD/SUB/LEA)
+ /// Offset = instruction stack adjustment
+ /// ... positive value for ADD/LEA and negative for SUB
+ /// FoundStackAdjust(instruction, Offset)
+ /// erase(instruction)
+ /// return CalcNewOffset(Offset)
+ /// else
+ /// return CalcNewOffset(0)
+ ///
+ /// It's possible that the selected instruction is not immediately
+ /// before/after MBBI for large adjustments that have been split into multiple
+ /// instructions.
+ ///
+ /// FoundStackAdjust should have the signature:
+ /// void FoundStackAdjust(MachineBasicBlock::iterator PI, int64_t Offset)
+ /// CalcNewOffset should have the signature:
+ /// int64_t CalcNewOffset(int64_t Offset)
+ template <typename FoundT, typename CalcT>
+ int64_t mergeSPUpdates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ FoundT FoundStackAdjust, CalcT CalcNewOffset,
+ bool doMergeWithPrevious) const;
+
+ template <typename CalcT>
+ int64_t mergeSPUpdates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI, CalcT CalcNewOffset,
+ bool doMergeWithPrevious) const {
+ auto FoundStackAdjust = [](MachineBasicBlock::iterator MBBI,
+ int64_t Offset) {};
+ return mergeSPUpdates(MBB, MBBI, FoundStackAdjust, CalcNewOffset,
+ doMergeWithPrevious);
+ }
+
+public:
+ /// Equivalent to:
+ /// mergeSPUpdates(MBB, MBBI,
+ /// [AddOffset](int64_t Offset) {
+ /// return AddOffset + Offset;
+ /// },
+ /// doMergeWithPrevious);
+ int64_t mergeSPAdd(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ int64_t AddOffset, bool doMergeWithPrevious) const;
/// Emit a series of instructions to increment / decrement the stack
/// pointer by a constant value.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8f90420..6cf6061 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2944,7 +2944,7 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model CM,
}
/// Return true if the condition is a signed comparison operation.
-static bool isX86CCSigned(unsigned X86CC) {
+static bool isX86CCSigned(X86::CondCode X86CC) {
switch (X86CC) {
default:
llvm_unreachable("Invalid integer condition!");
@@ -22975,7 +22975,7 @@ static bool isProfitableToUseFlagOp(SDValue Op) {
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
-static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
+static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// CF and OF aren't always set the way we want. Determine which
// of these we need.
@@ -23085,7 +23085,7 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
-static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC,
const SDLoc &dl, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (isNullConstant(Op1))
@@ -23157,10 +23157,17 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
return Add.getValue(1);
}
- // Use SUB instead of CMP to enable CSE between SUB and CMP.
+ // If we already have an XOR of the ops, use that to check for equality.
+ // Else use SUB instead of CMP to enable CSE between SUB and CMP.
+ unsigned X86Opc = X86ISD::SUB;
+ if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
+ (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
+ DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
+ X86Opc = X86ISD::XOR;
+
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
- SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
- return Sub.getValue(1);
+ SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
+ return CmpOp.getValue(1);
}
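The new X86Opc selection matters when the program has already computed the XOR of the two operands: both XOR and SUB set ZF exactly when the operands are equal, so for COND_E/COND_NE the compare can reuse the existing XOR node instead of emitting a separate SUB/CMP. A small hypothetical illustration, not taken from the patch:

    // 'x' is live, so the equality test can fold onto the XOR's flags.
    int pick(int a, int b) {
      int x = a ^ b;
      return x == 0 ? -1 : x;
    }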
bool X86TargetLowering::isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,