Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 264
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrFormats.td | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 95
-rw-r--r--  llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp | 64
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Processors.td | 11
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Subtarget.h | 6
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp | 4
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 87
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 26
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h | 17
-rw-r--r--  llvm/lib/Target/AMDGPU/SIDefines.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 96
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 34
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/SOPInstructions.td | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 4
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 34
-rw-r--r--  llvm/lib/Target/AVR/AVRISelLowering.cpp | 7
-rw-r--r--  llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp | 37
-rw-r--r--  llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 15
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.h | 2
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td | 27
-rw-r--r--  llvm/lib/Target/Mips/MipsCCState.cpp | 5
-rw-r--r--  llvm/lib/Target/Mips/MipsCCState.h | 8
-rw-r--r--  llvm/lib/Target/Mips/MipsCallLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/Mips/MipsCallingConv.td | 8
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 150
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 6
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 56
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 62
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 8
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 7
-rw-r--r--  llvm/lib/Target/PowerPC/PPCCCState.h | 30
-rw-r--r--  llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 104
-rw-r--r--  llvm/lib/Target/PowerPC/PPCInstrP10.td | 75
-rw-r--r--  llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp | 10
-rw-r--r--  llvm/lib/Target/RISCV/RISCVCallingConv.cpp | 14
-rw-r--r--  llvm/lib/Target/RISCV/RISCVCallingConv.h | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFeatures.td | 12
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 12
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 49
-rw-r--r--  llvm/lib/Target/RISCV/RISCVMacroFusion.td | 56
-rw-r--r--  llvm/lib/Target/RISCV/RISCVProcessors.td | 7
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td | 163
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 25
-rw-r--r--  llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h | 7
-rw-r--r--  llvm/lib/Target/SPIRV/CMakeLists.txt | 1
-rw-r--r--  llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRV.h | 2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVAPI.cpp | 2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp | 7
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 3
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 19
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp | 1
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 14
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 3
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp | 159
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 6
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h | 7
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp | 2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 1
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp | 2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp | 1
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 8
-rw-r--r--  llvm/lib/Target/Sparc/SparcISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZCallingConv.h | 15
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZCallingConv.td | 38
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp | 4
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 4
-rw-r--r--  llvm/lib/Target/X86/GISel/X86CallLowering.cpp | 5
-rw-r--r--  llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp | 67
-rw-r--r--  llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp | 3
-rw-r--r--  llvm/lib/Target/Xtensa/XtensaFeatures.td | 16
-rw-r--r--  llvm/lib/Target/Xtensa/XtensaISelLowering.cpp | 31
-rw-r--r--  llvm/lib/Target/Xtensa/XtensaISelLowering.h | 6
-rw-r--r--  llvm/lib/Target/Xtensa/XtensaInstrInfo.td | 42
-rw-r--r--  llvm/lib/Target/Xtensa/XtensaRegisterInfo.td | 10
-rw-r--r--  llvm/lib/Target/Xtensa/XtensaSubtarget.h | 2
-rw-r--r--  llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp | 6
84 files changed, 1623 insertions, 634 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2b6ea86..3c06c6a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8537,7 +8537,7 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI,
if (IsCalleeWin64) {
UseVarArgCC = true;
} else {
- UseVarArgCC = !Outs[i].IsFixed;
+ UseVarArgCC = ArgFlags.isVarArg();
}
}
@@ -8982,7 +8982,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
- if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
+ if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
report_fatal_error("Passing SVE types to variadic functions is "
"currently not supported");
}
@@ -11390,13 +11390,18 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
// select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
// select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
// The second forms can be matched into subs+cneg.
+ // NOTE: Drop poison generating flags from the negated operand to avoid
+ // inadvertently propagating poison after the canonicalisation.
if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
- FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS)
+ FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) {
+ TVal->dropFlags(SDNodeFlags::PoisonGeneratingFlags);
FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
- else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
- FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS)
+ } else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
+ FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) {
+ FVal->dropFlags(SDNodeFlags::PoisonGeneratingFlags);
TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
+ }
}
unsigned Opcode = AArch64ISD::CSEL;
@@ -13477,7 +13482,7 @@ static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
// Look for the first non-undef element.
const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
- // Benefit form APInt to handle overflow when calculating expected element.
+ // Benefit from APInt to handle overflow when calculating expected element.
unsigned NumElts = VT.getVectorNumElements();
unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
@@ -13485,7 +13490,7 @@ static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
// The following shuffle indices must be the successive elements after the
// first real element.
bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
- return Elt != ExpectedElt++ && Elt != -1;
+ return Elt != ExpectedElt++ && Elt >= 0;
});
if (FoundWrongElt)
return false;
@@ -14737,6 +14742,106 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
return ResultSLI;
}
+static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64TargetLowering &TLI) {
+ EVT VT = N->getValueType(0);
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+ const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+
+ if (!VT.isVector())
+ return SDValue();
+
+ if (VT.isScalableVector() && !Subtarget.hasSVE2())
+ return SDValue();
+
+ if (VT.isFixedLengthVector() &&
+ (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ if (N0.getOpcode() != ISD::AND)
+ return SDValue();
+
+ SDValue N1 = N->getOperand(1);
+ if (N1.getOpcode() != ISD::AND)
+ return SDValue();
+
+ // InstCombine does (not (neg a)) => (add a -1).
+ // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
+ // Loop over all combinations of AND operands.
+ for (int i = 1; i >= 0; --i) {
+ for (int j = 1; j >= 0; --j) {
+ SDValue O0 = N0->getOperand(i);
+ SDValue O1 = N1->getOperand(j);
+ SDValue Sub, Add, SubSibling, AddSibling;
+
+ // Find a SUB and an ADD operand, one from each AND.
+ if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
+ Sub = O0;
+ Add = O1;
+ SubSibling = N0->getOperand(1 - i);
+ AddSibling = N1->getOperand(1 - j);
+ } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
+ Add = O0;
+ Sub = O1;
+ AddSibling = N0->getOperand(1 - i);
+ SubSibling = N1->getOperand(1 - j);
+ } else
+ continue;
+
+ if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
+ continue;
+
+ // Constant ones is always righthand operand of the Add.
+ if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
+ continue;
+
+ if (Sub.getOperand(1) != Add.getOperand(0))
+ continue;
+
+ return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
+ }
+ }
+
+ // (or (and a b) (and (not a) c)) => (bsl a b c)
+ // We only have to look for constant vectors here since the general, variable
+ // case can be handled in TableGen.
+ unsigned Bits = VT.getScalarSizeInBits();
+ uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
+ for (int i = 1; i >= 0; --i)
+ for (int j = 1; j >= 0; --j) {
+ APInt Val1, Val2;
+
+ if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
+ ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
+ (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
+ return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
+ N0->getOperand(1 - i), N1->getOperand(1 - j));
+ }
+ BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
+ BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
+ if (!BVN0 || !BVN1)
+ continue;
+
+ bool FoundMatch = true;
+ for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
+ ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
+ ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
+ if (!CN0 || !CN1 ||
+ CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
+ FoundMatch = false;
+ break;
+ }
+ }
+ if (FoundMatch)
+ return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
+ N0->getOperand(1 - i), N1->getOperand(1 - j));
+ }
+
+ return SDValue();
+}
+
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
if (useSVEForFixedLengthVectorVT(Op.getValueType(),
@@ -15772,6 +15877,7 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
isREVMask(M, EltSize, NumElts, 32) ||
isREVMask(M, EltSize, NumElts, 16) ||
isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
+ isSingletonEXTMask(M, VT, DummyUnsigned) ||
isTRNMask(M, NumElts, DummyUnsigned) ||
isUZPMask(M, NumElts, DummyUnsigned) ||
isZIPMask(M, NumElts, DummyUnsigned) ||
@@ -16284,9 +16390,8 @@ AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
if (Align)
- SP =
- DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
- DAG.getSignedConstant(-(uint64_t)Align->value(), DL, VT));
+ SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
+ DAG.getSignedConstant(-Align->value(), DL, VT));
Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, DL);
@@ -16323,7 +16428,7 @@ AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
- DAG.getSignedConstant(-(uint64_t)Align->value(), DL, VT));
+ DAG.getSignedConstant(-Align->value(), DL, VT));
Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
@@ -16351,7 +16456,7 @@ AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
- DAG.getSignedConstant(-(uint64_t)Align->value(), DL, VT));
+ DAG.getSignedConstant(-Align->value(), DL, VT));
// Set the real SP to the new value with a probing loop.
Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, DL, MVT::Other, Chain, SP);
@@ -19414,106 +19519,6 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
return FixConv;
}
-static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
- const AArch64TargetLowering &TLI) {
- EVT VT = N->getValueType(0);
- SelectionDAG &DAG = DCI.DAG;
- SDLoc DL(N);
- const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
-
- if (!VT.isVector())
- return SDValue();
-
- if (VT.isScalableVector() && !Subtarget.hasSVE2())
- return SDValue();
-
- if (VT.isFixedLengthVector() &&
- (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
- return SDValue();
-
- SDValue N0 = N->getOperand(0);
- if (N0.getOpcode() != ISD::AND)
- return SDValue();
-
- SDValue N1 = N->getOperand(1);
- if (N1.getOpcode() != ISD::AND)
- return SDValue();
-
- // InstCombine does (not (neg a)) => (add a -1).
- // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
- // Loop over all combinations of AND operands.
- for (int i = 1; i >= 0; --i) {
- for (int j = 1; j >= 0; --j) {
- SDValue O0 = N0->getOperand(i);
- SDValue O1 = N1->getOperand(j);
- SDValue Sub, Add, SubSibling, AddSibling;
-
- // Find a SUB and an ADD operand, one from each AND.
- if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
- Sub = O0;
- Add = O1;
- SubSibling = N0->getOperand(1 - i);
- AddSibling = N1->getOperand(1 - j);
- } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
- Add = O0;
- Sub = O1;
- AddSibling = N0->getOperand(1 - i);
- SubSibling = N1->getOperand(1 - j);
- } else
- continue;
-
- if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
- continue;
-
- // Constant ones is always righthand operand of the Add.
- if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
- continue;
-
- if (Sub.getOperand(1) != Add.getOperand(0))
- continue;
-
- return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
- }
- }
-
- // (or (and a b) (and (not a) c)) => (bsl a b c)
- // We only have to look for constant vectors here since the general, variable
- // case can be handled in TableGen.
- unsigned Bits = VT.getScalarSizeInBits();
- uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
- for (int i = 1; i >= 0; --i)
- for (int j = 1; j >= 0; --j) {
- APInt Val1, Val2;
-
- if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
- ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
- (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
- return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
- N0->getOperand(1 - i), N1->getOperand(1 - j));
- }
- BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
- BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
- if (!BVN0 || !BVN1)
- continue;
-
- bool FoundMatch = true;
- for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
- ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
- ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
- if (!CN0 || !CN1 ||
- CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
- FoundMatch = false;
- break;
- }
- }
- if (FoundMatch)
- return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
- N0->getOperand(1 - i), N1->getOperand(1 - j));
- }
-
- return SDValue();
-}
-
// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
// convert to csel(ccmp(.., cc0)), depending on cc1:
@@ -25450,6 +25455,29 @@ static SDValue performCSELCombine(SDNode *N,
}
}
+ // CSEL a, b, cc, SUBS(SUB(x,y), 0) -> CSEL a, b, cc, SUBS(x,y) if cc doesn't
+ // use overflow flags, to avoid the comparison with zero. In case of success,
+ // this also replaces the original SUB(x,y) with the newly created SUBS(x,y).
+ // NOTE: Perhaps in the future use performFlagSettingCombine to replace SUB
+ // nodes with their SUBS equivalent as is already done for other flag-setting
+ // operators, in which case doing the replacement here becomes redundant.
+ if (Cond.getOpcode() == AArch64ISD::SUBS && Cond->hasNUsesOfValue(1, 1) &&
+ isNullConstant(Cond.getOperand(1))) {
+ SDValue Sub = Cond.getOperand(0);
+ AArch64CC::CondCode CC =
+ static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
+ if (Sub.getOpcode() == ISD::SUB &&
+ (CC == AArch64CC::EQ || CC == AArch64CC::NE || CC == AArch64CC::MI ||
+ CC == AArch64CC::PL)) {
+ SDLoc DL(N);
+ SDValue Subs = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
+ Sub.getOperand(0), Sub.getOperand(1));
+ DCI.CombineTo(Sub.getNode(), Subs);
+ DCI.CombineTo(Cond.getNode(), Subs, Subs.getValue(1));
+ return SDValue(N, 0);
+ }
+ }
+
// CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
if (SDValue CondLast = foldCSELofLASTB(N, DAG))
return CondLast;
@@ -28609,14 +28637,16 @@ Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionalities for stack protection.
- if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
+ RTLIB::LibcallImpl SecurityCheckCookieLibcall =
+ getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
+ if (SecurityCheckCookieLibcall != RTLIB::Unsupported) {
// MSVC CRT has a global variable holding security cookie.
M.getOrInsertGlobal("__security_cookie",
PointerType::getUnqual(M.getContext()));
// MSVC CRT has a function to validate security cookie.
FunctionCallee SecurityCheckCookie =
- M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
+ M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
Type::getVoidTy(M.getContext()),
PointerType::getUnqual(M.getContext()));
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
@@ -28637,8 +28667,10 @@ Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
- if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
- return M.getFunction(Subtarget->getSecurityCheckCookieName());
+ RTLIB::LibcallImpl SecurityCheckCookieLibcall =
+ getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
+ if (SecurityCheckCookieLibcall != RTLIB::Unsupported)
+ return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall));
return TargetLowering::getSSPStackGuardCheck(M);
}
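A minimal sketch of the source pattern the new performCSELCombine rule targets (hypothetical example and approximate assembly, not taken from this commit or its tests):

    // The difference is needed as a value and is also compared against zero,
    // so the DAG used to contain SUB(x, y) plus a separate SUBS(SUB(x, y), 0)
    // feeding the CSEL; the combine lets a single flag-setting SUBS provide
    // both the value and the condition.
    int pick(int x, int y, int a) {
      int d = x - y;
      return d == 0 ? a : d;  // roughly: subs w8, w0, w1; csel w0, w2, w8, eq
    }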
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 5a537f2..d068a12 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -12564,7 +12564,7 @@ multiclass STOPregister<string asm, string instr> {
let Predicates = [HasLSUI] in
class BaseSTOPregisterLSUI<string asm, RegisterClass OP, Register Reg,
Instruction inst> :
- InstAlias<asm # "\t$Rs, [$Rn]", (inst Reg, OP:$Rs, GPR64sp:$Rn), 0>;
+ InstAlias<asm # "\t$Rs, [$Rn]", (inst Reg, OP:$Rs, GPR64sp:$Rn)>;
multiclass STOPregisterLSUI<string asm, string instr> {
def : BaseSTOPregisterLSUI<asm # "l", GPR32, WZR,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 59d4fd2..fb59c9f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5861,33 +5861,41 @@ void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
}
}
-// Convenience function to create a DWARF expression for
-// Expr + NumBytes + NumVGScaledBytes * AArch64::VG
-static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
- int NumVGScaledBytes, unsigned VG,
- llvm::raw_string_ostream &Comment) {
- uint8_t buffer[16];
-
- if (NumBytes) {
+// Convenience function to create a DWARF expression for: Constant `Operation`.
+// This helper emits compact sequences for common cases. For example, for `-15
+// DW_OP_plus`, this helper would create DW_OP_lit15 DW_OP_minus.
+static void appendConstantExpr(SmallVectorImpl<char> &Expr, int64_t Constant,
+ dwarf::LocationAtom Operation) {
+ if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) {
+ // -Constant (1 to 31)
+ Expr.push_back(dwarf::DW_OP_lit0 - Constant);
+ Operation = dwarf::DW_OP_minus;
+ } else if (Constant >= 0 && Constant <= 31) {
+ // Literal value 0 to 31
+ Expr.push_back(dwarf::DW_OP_lit0 + Constant);
+ } else {
+ // Signed constant
Expr.push_back(dwarf::DW_OP_consts);
- Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
- Expr.push_back((uint8_t)dwarf::DW_OP_plus);
- Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
+ appendLEB128<LEB128Sign::Signed>(Expr, Constant);
}
+ return Expr.push_back(Operation);
+}
- if (NumVGScaledBytes) {
- Expr.push_back((uint8_t)dwarf::DW_OP_consts);
- Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
-
- Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
- Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
- Expr.push_back(0);
-
- Expr.push_back((uint8_t)dwarf::DW_OP_mul);
- Expr.push_back((uint8_t)dwarf::DW_OP_plus);
+// Convenience function to create a DWARF expression for a register.
+static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
+ Expr.push_back((char)dwarf::DW_OP_bregx);
+ appendLEB128<LEB128Sign::Unsigned>(Expr, RegNum);
+ Expr.push_back(0);
+}
- Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
- << std::abs(NumVGScaledBytes) << " * VG";
+// Convenience function to create a comment for
+// (+/-) NumBytes (* RegScale)?
+static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
+ StringRef RegScale = {}) {
+ if (NumBytes) {
+ Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
+ if (!RegScale.empty())
+ Comment << ' ' << RegScale;
}
}
@@ -5909,19 +5917,26 @@ static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
else
Comment << printReg(Reg, &TRI);
- // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
+ // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
SmallString<64> Expr;
unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
- Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
- Expr.push_back(0);
- appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
- TRI.getDwarfRegNum(AArch64::VG, true), Comment);
+ assert(DwarfReg >= 0 && DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
+ // Reg + NumBytes
+ Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg);
+ appendLEB128<LEB128Sign::Signed>(Expr, NumBytes);
+ appendOffsetComment(NumBytes, Comment);
+ if (NumVGScaledBytes) {
+ // + VG * NumVGScaledBytes
+ appendOffsetComment(NumVGScaledBytes, Comment, "* VG");
+ appendReadRegExpr(Expr, TRI.getDwarfRegNum(AArch64::VG, true));
+ appendConstantExpr(Expr, NumVGScaledBytes, dwarf::DW_OP_mul);
+ Expr.push_back(dwarf::DW_OP_plus);
+ }
// Wrap this into DW_CFA_def_cfa.
SmallString<64> DefCfaExpr;
DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
- uint8_t buffer[16];
- DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
+ appendLEB128<LEB128Sign::Unsigned>(DefCfaExpr, Expr.size());
DefCfaExpr.append(Expr.str());
return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
Comment.str());
@@ -5958,17 +5973,25 @@ MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
llvm::raw_string_ostream Comment(CommentBuffer);
Comment << printReg(Reg, &TRI) << " @ cfa";
- // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
+ // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes)
+ assert(NumVGScaledBytes && "Expected scalable offset");
SmallString<64> OffsetExpr;
- appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
- TRI.getDwarfRegNum(AArch64::VG, true), Comment);
+ // + VG * NumVGScaledBytes
+ appendOffsetComment(NumVGScaledBytes, Comment, "* VG");
+ appendReadRegExpr(OffsetExpr, TRI.getDwarfRegNum(AArch64::VG, true));
+ appendConstantExpr(OffsetExpr, NumVGScaledBytes, dwarf::DW_OP_mul);
+ OffsetExpr.push_back(dwarf::DW_OP_plus);
+ if (NumBytes) {
+ // + NumBytes
+ appendOffsetComment(NumBytes, Comment);
+ appendConstantExpr(OffsetExpr, NumBytes, dwarf::DW_OP_plus);
+ }
// Wrap this into DW_CFA_expression
SmallString<64> CfaExpr;
CfaExpr.push_back(dwarf::DW_CFA_expression);
- uint8_t buffer[16];
- CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
- CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
+ appendLEB128<LEB128Sign::Unsigned>(CfaExpr, DwarfReg);
+ appendLEB128<LEB128Sign::Unsigned>(CfaExpr, OffsetExpr.size());
CfaExpr.append(OffsetExpr.str());
return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
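As a worked illustration of the more compact escapes the new helpers emit (values chosen for illustration; VG's DWARF register number is assumed to be 46 here): for a register saved at CFA + 16 * VG + 8, createCFAOffset now produces an expression along the lines of

    DW_OP_bregx 46, 0    // read VG
    DW_OP_lit16          // NumVGScaledBytes = 16
    DW_OP_mul
    DW_OP_plus
    DW_OP_lit8           // NumBytes = 8
    DW_OP_plus

whereas the removed appendVGScaledOffsetExpr spelled every constant as DW_OP_consts followed by an SLEB128 payload.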
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index b97d622..fd4ef2a 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -8,8 +8,8 @@
//
// This pass performs below peephole optimizations on MIR level.
//
-// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
-// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
+// 1. MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri
+// MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri
//
// 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi
// MOVi64imm + ADDXrr ==> ADDXri + ADDXri
@@ -128,6 +128,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
// Strategy used to split logical immediate bitmasks.
enum class SplitStrategy {
Intersect,
+ Disjoint,
};
template <typename T>
bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
@@ -163,6 +164,7 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
T UImm = static_cast<T>(Imm);
+ assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!");
// The bitmask immediate consists of consecutive ones. Let's say there is
// constant 0b00000000001000000000010000000000 which does not consist of
@@ -191,18 +193,47 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
}
template <typename T>
+static bool splitDisjointBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc,
+ T &Imm2Enc) {
+ assert(Imm && (Imm != ~static_cast<T>(0)) && "Invalid immediate!");
+
+ // Try to split a bitmask of the form 0b00000000011000000000011110000000 into
+ // two disjoint masks such as 0b00000000011000000000000000000000 and
+ // 0b00000000000000000000011110000000 where the inclusive/exclusive OR of the
+ // new masks match the original mask.
+ unsigned LowestBitSet = llvm::countr_zero(Imm);
+ unsigned LowestGapBitUnset =
+ LowestBitSet + llvm::countr_one(Imm >> LowestBitSet);
+
+ // Create a mask for the least significant group of consecutive ones.
+ assert(LowestGapBitUnset < sizeof(T) * CHAR_BIT && "Undefined behaviour!");
+ T NewImm1 = (static_cast<T>(1) << LowestGapBitUnset) -
+ (static_cast<T>(1) << LowestBitSet);
+ // Create a disjoint mask for the remaining ones.
+ T NewImm2 = Imm & ~NewImm1;
+
+ // Do not split if NewImm2 is not a valid bitmask immediate.
+ if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
+ return false;
+
+ Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
+ Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
+ return true;
+}
+
+template <typename T>
bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
SplitStrategy Strategy,
unsigned OtherOpc) {
- // Try below transformation.
+ // Try below transformations.
//
- // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
- // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
+ // MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri
+ // MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri
//
// The mov pseudo instruction could be expanded to multiple mov instructions
// later. Let's try to split the constant operand of mov instruction into two
- // bitmask immediates. It makes only two AND instructions instead of multiple
- // mov + and instructions.
+ // bitmask immediates based on the given split strategy. It makes only two
+ // logical instructions instead of multiple mov + logic instructions.
return splitTwoPartImm<T>(
MI,
@@ -224,6 +255,9 @@ bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
case SplitStrategy::Intersect:
SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1);
break;
+ case SplitStrategy::Disjoint:
+ SplitSucc = splitDisjointBitmaskImm(Imm, RegSize, Imm0, Imm1);
+ break;
}
if (SplitSucc)
return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc);
@@ -889,6 +923,22 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
Changed |= trySplitLogicalImm<uint64_t>(
AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri);
break;
+ case AArch64::EORWrr:
+ Changed |= trySplitLogicalImm<uint32_t>(AArch64::EORWri, MI,
+ SplitStrategy::Disjoint);
+ break;
+ case AArch64::EORXrr:
+ Changed |= trySplitLogicalImm<uint64_t>(AArch64::EORXri, MI,
+ SplitStrategy::Disjoint);
+ break;
+ case AArch64::ORRWrr:
+ Changed |= trySplitLogicalImm<uint32_t>(AArch64::ORRWri, MI,
+ SplitStrategy::Disjoint);
+ break;
+ case AArch64::ORRXrr:
+ Changed |= trySplitLogicalImm<uint64_t>(AArch64::ORRXri, MI,
+ SplitStrategy::Disjoint);
+ break;
case AArch64::ORRWrs:
Changed |= visitORR(MI);
break;
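A hedged example of the new SplitStrategy::Disjoint path (constants chosen for illustration, not from the commit): 0x06000780 has two separated runs of ones and is not a valid AArch64 logical immediate, but it splits into the disjoint, individually encodable masks 0x780 and 0x06000000, so a function such as

    unsigned set_bits(unsigned x) {
      return x | 0x06000780u;  // two runs of ones -> not directly encodable
    }

can now be selected as two ORRWri instructions (roughly orr w0, w0, #0x780 followed by orr w0, w0, #0x6000000) instead of materialising the constant with a movz/movk pair and using ORRWrr.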
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index adc984a..1bc1d98 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -22,7 +22,8 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320",
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
- FeatureUseWzrToVecMove]>;
+ FeatureUseWzrToVecMove,
+ FeatureUseFixedOverScalableIfEqualCost]>;
def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
"Cortex-A53 ARM processors", [
@@ -45,7 +46,8 @@ def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510",
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
- FeatureUseWzrToVecMove
+ FeatureUseWzrToVecMove,
+ FeatureUseFixedOverScalableIfEqualCost
]>;
def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520",
@@ -53,7 +55,8 @@ def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520",
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
- FeatureUseWzrToVecMove]>;
+ FeatureUseWzrToVecMove,
+ FeatureUseFixedOverScalableIfEqualCost]>;
def TuneA520AE : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520",
"Cortex-A520AE ARM processors", [
@@ -756,7 +759,6 @@ def ProcessorFeatures {
FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2,
FeatureComplxNum, FeatureCRC, FeatureDotProd,
FeatureFPARMv8,FeatureFullFP16, FeatureJS, FeatureLSE,
- FeatureUseFixedOverScalableIfEqualCost,
FeatureRAS, FeatureRCPC, FeatureRDM, FeatureFPAC];
list<SubtargetFeature> A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
FeatureMTE, FeatureETE, FeatureSVEBitPerm,
@@ -766,7 +768,6 @@ def ProcessorFeatures {
FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC,
FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, FeatureJS,
FeatureNEON, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM,
- FeatureUseFixedOverScalableIfEqualCost,
FeatureDotProd, FeatureFPAC];
list<SubtargetFeature> A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
FeatureMTE, FeatureETE, FeatureSVEBitPerm,
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 061ed61..d00e447 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -451,12 +451,6 @@ public:
return "__chkstk";
}
- const char* getSecurityCheckCookieName() const {
- if (isWindowsArm64EC())
- return "#__security_check_cookie_arm64ec";
- return "__security_check_cookie";
- }
-
/// Choose a method of checking LR before performing a tail call.
AArch64PAuth::AuthCheckMethod
getAuthenticatedLRCheckMethod(const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 010d0aaa..2155ace 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -125,7 +125,7 @@ struct AArch64OutgoingValueAssigner
bool UseVarArgsCCForFixed = IsCalleeWin && State.isVarArg();
bool Res;
- if (Info.IsFixed && !UseVarArgsCCForFixed) {
+ if (!Flags.isVarArg() && !UseVarArgsCCForFixed) {
if (!IsReturn)
applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT);
Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
@@ -361,7 +361,7 @@ struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler {
unsigned MaxSize = MemTy.getSizeInBytes() * 8;
// For varargs, we always want to extend them to 8 bytes, in which case
// we disable setting a max.
- if (!Arg.IsFixed)
+ if (Arg.Flags[0].isVarArg())
MaxSize = 0;
Register ValVReg = Arg.Regs[RegIndex];
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index d905692..f359731 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -1697,7 +1697,7 @@ bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB,
Pred);
AArch64CC::CondCode CC1, CC2;
- changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2);
+ changeFCMPPredToAArch64CC(Pred, CC1, CC2);
MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
if (CC2 != AArch64CC::AL)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index d84f512..f266398 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1013,6 +1013,14 @@ def FeatureAgentScopeFineGrainedRemoteMemoryAtomics
"device memory."
>;
+def FeatureEmulatedSystemScopeAtomics
+ : SubtargetFeature<"emulated-system-scope-atomics",
+ "HasEmulatedSystemScopeAtomics",
+ "true",
+ "System scope atomics unsupported by the PCI-e are emulated in HW via CAS "
+ "loop and functional."
+>;
+
def FeatureDefaultComponentZero : SubtargetFeature<"default-component-zero",
"HasDefaultComponentZero",
"true",
@@ -2062,6 +2070,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureFlatBufferGlobalAtomicFaddF64Inst,
FeatureMemoryAtomicFAddF32DenormalSupport,
+ FeatureEmulatedSystemScopeAtomics,
FeatureGloballyAddressableScratch,
FeatureKernargPreload,
FeatureVmemPrefInsts,
@@ -2603,6 +2612,10 @@ def HasPkMinMax3Insts :
Predicate<"Subtarget->hasPkMinMax3Insts()">,
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+def HasSGetShaderCyclesInst :
+ Predicate<"Subtarget->hasSGetShaderCyclesInst()">,
+ AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+
def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">,
AssemblerPredicate<(all_of FeatureImageInsts)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 6681393..2a324e5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -486,12 +486,16 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
// Pad with s_code_end to help tools and guard against instruction prefetch
// causing stale data in caches. Arguably this should be done by the linker,
// which is why this isn't done for Mesa.
+ // Don't do it if there is no code.
const MCSubtargetInfo &STI = *getGlobalSTI();
if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
(STI.getTargetTriple().getOS() == Triple::AMDHSA ||
STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
- OutStreamer->switchSection(getObjFileLowering().getTextSection());
- getTargetStreamer()->EmitCodeEnd(STI);
+ MCSection *TextSect = getObjFileLowering().getTextSection();
+ if (TextSect->hasInstructions()) {
+ OutStreamer->switchSection(TextSect);
+ getTargetStreamer()->EmitCodeEnd(STI);
+ }
}
// Assign expressions which can only be resolved when all other functions are
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 7771f9b..64e68ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -367,18 +367,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
- setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
- setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
- setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
-
- setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
- setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
- setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
-
- setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
- setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
- setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
-
setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 1fdf272..a6e4a63 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2271,6 +2271,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
? AMDGPU::SRC_SHARED_BASE
: AMDGPU::SRC_PRIVATE_BASE;
+ assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
+ !ST.hasGloballyAddressableScratch()) &&
+ "Cannot use src_private_base with globally addressable scratch!");
// FIXME: It would be more natural to emit a COPY here, but then copy
// coalescing would kick in and it would think it's okay to use the "HI"
// subregister (instead of extracting the HI 32 bits) which is an artificial
@@ -2396,11 +2399,30 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
+ auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
+ if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ ST.hasGloballyAddressableScratch()) {
+ // flat -> private with globally addressable scratch: subtract
+ // src_flat_scratch_base_lo.
+ const LLT S32 = LLT::scalar(32);
+ Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
+ Register FlatScratchBaseLo =
+ B.buildInstr(AMDGPU::S_MOV_B32, {S32},
+ {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
+ .getReg(0);
+ MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
+ Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
+ return B.buildIntToPtr(Dst, Sub).getReg(0);
+ }
+
+ // Extract low 32-bits of the pointer.
+ return B.buildExtract(Dst, Src, 0).getReg(0);
+ };
+
// For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
// G_ADDRSPACE_CAST we need to guess.
if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
- // Extract low 32-bits of the pointer.
- B.buildExtract(Dst, Src, 0);
+ castFlatToLocalOrPrivate(Dst);
MI.eraseFromParent();
return true;
}
@@ -2411,7 +2433,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
auto FlatNull = B.buildConstant(SrcTy, 0);
// Extract low 32-bits of the pointer.
- auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
+ auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
auto CmpRes =
B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
@@ -2425,14 +2447,45 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
- Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
- if (!ApertureReg.isValid())
- return false;
-
// Coerce the type of the low half of the result so we can use
// merge_values.
Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
+ if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ ST.hasGloballyAddressableScratch()) {
+ // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
+ // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
+ Register AllOnes = B.buildConstant(S32, -1).getReg(0);
+ Register ThreadID = B.buildConstant(S32, 0).getReg(0);
+ ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
+ .addUse(AllOnes)
+ .addUse(ThreadID)
+ .getReg(0);
+ if (ST.isWave64()) {
+ ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
+ .addUse(AllOnes)
+ .addUse(ThreadID)
+ .getReg(0);
+ }
+ Register ShAmt =
+ B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
+ Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
+ Register CvtPtr =
+ B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
+ // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
+ // 64-bit hi:lo value.
+ Register FlatScratchBase =
+ B.buildInstr(AMDGPU::S_MOV_B64, {S64},
+ {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
+ .getReg(0);
+ MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
+ return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
+ }
+
+ Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
+ if (!ApertureReg.isValid())
+ return false;
+
// TODO: Should we allow mismatched types but matching sizes in merges to
// avoid the ptrtoint?
return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
@@ -5788,11 +5841,25 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B,
unsigned AddrSpace) const {
- Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
- auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
+ const LLT S32 = LLT::scalar(32);
+ auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
Register Hi32 = Unmerge.getReg(1);
- B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
+ ST.hasGloballyAddressableScratch()) {
+ Register FlatScratchBaseHi =
+ B.buildInstr(AMDGPU::S_MOV_B32, {S32},
+ {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
+ .getReg(0);
+ MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
+ // Test bits 63..58 against the aperture address.
+ Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
+ B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
+ B.buildConstant(S32, 1u << 26));
+ } else {
+ Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
+ B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
+ }
MI.eraseFromParent();
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index a0c99b0..846a0b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -991,10 +991,21 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
return true;
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
- if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
+ Intrinsic::ID IID = Intrinsic->getIntrinsicID();
+ switch (IID) {
+ case Intrinsic::read_register:
return isReadRegisterSourceOfDivergence(Intrinsic);
-
- return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
+ case Intrinsic::amdgcn_addrspacecast_nonnull: {
+ unsigned SrcAS =
+ Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
+ unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
+ return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ DstAS == AMDGPUAS::FLAT_ADDRESS &&
+ ST->hasGloballyAddressableScratch();
+ }
+ default:
+ return AMDGPU::isIntrinsicSourceOfDivergence(IID);
+ }
}
// Assume all function calls are a source of divergence.
@@ -1008,6 +1019,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
if (isa<InvokeInst>(V))
return true;
+ // If the target supports globally addressable scratch, the mapping from
+ // scratch memory to the flat aperture changes, so an address space cast
+ // is no longer uniform.
+ if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
+ return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+ CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
+ ST->hasGloballyAddressableScratch();
+ }
+
return false;
}
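For reference, the flat address formed for a private pointer under globally addressable scratch (per the wave32 formula documented in the legalizer above and in SIISelLowering.cpp below; names here are illustrative pseudo-notation) is roughly

    flat = ((uint64_t)(lane_id & 0x1f) << 52) + FLAT_SCRATCH_BASE + private_offset

so the result of a private-to-flat cast depends on the lane ID and can never be treated as uniform, which is what the new isSourceOfDivergence and getGenericInstructionUniformity checks encode.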
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 5530886..f47ddf5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -187,6 +187,7 @@ protected:
bool HasFlatBufferGlobalAtomicFaddF64Inst = false;
bool HasDefaultComponentZero = false;
bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
+ bool HasEmulatedSystemScopeAtomics = false;
bool HasDefaultComponentBroadcast = false;
bool HasXF32Insts = false;
/// The maximum number of instructions that may be placed within an S_CLAUSE,
@@ -950,6 +951,12 @@ public:
return HasAgentScopeFineGrainedRemoteMemoryAtomics;
}
+ /// \return true if HW emulates system scope atomics unsupported by the PCI-e
+ /// via CAS loop.
+ bool hasEmulatedSystemScopeAtomics() const {
+ return HasEmulatedSystemScopeAtomics;
+ }
+
bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
bool hasDefaultComponentBroadcast() const {
@@ -1081,7 +1088,7 @@ public:
}
bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
- bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
+ bool hasLDSFPAtomicAddF64() const { return GFX90AInsts || GFX1250Insts; }
/// \returns true if the subtarget has the v_permlanex16_b32 instruction.
bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
@@ -1555,12 +1562,16 @@ public:
// \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
bool hasPkMinMax3Insts() const { return GFX1250Insts; }
+ // \returns true if target has S_GET_SHADER_CYCLES_U64 instruction.
+ bool hasSGetShaderCyclesInst() const { return GFX1250Insts; }
+
// \returns true if target has S_SETPRIO_INC_WG instruction.
bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }
// \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
- // of sign-extending.
- bool hasGetPCZeroExtension() const { return GFX12Insts; }
+ // of sign-extending. Note that GFX1250 has not only fixed the bug but also
+ // extended VA to 57 bits.
+ bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; }
/// \returns SGPR allocation granularity supported by the subtarget.
unsigned getSGPRAllocGranule() const {
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index deadb7a..2d0102f 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -536,6 +536,10 @@ enum Id { // HwRegCode, (6) [5:0]
ID_SQ_PERF_SNAPSHOT_DATA1 = 22,
ID_SQ_PERF_SNAPSHOT_PC_LO = 23,
ID_SQ_PERF_SNAPSHOT_PC_HI = 24,
+
+ // GFX1250
+ ID_XNACK_STATE_PRIV = 33,
+ ID_XNACK_MASK_gfx1250 = 34,
};
enum Offset : unsigned { // Offset, (5) [10:6]
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4d67e4a..8f44c03 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2098,10 +2098,17 @@ bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
- // Flat -> private/local is a simple truncate.
- // Flat -> global is no-op
- if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
+ if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
+ if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ Subtarget->hasGloballyAddressableScratch()) {
+ // Flat -> private requires subtracting src_flat_scratch_base_lo.
+ return false;
+ }
+
+ // Flat -> private/local is a simple truncate.
+ // Flat -> global is no-op
return true;
+ }
const GCNTargetMachine &TM =
static_cast<const GCNTargetMachine &>(getTargetMachine());
@@ -7650,6 +7657,9 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
? AMDGPU::SRC_SHARED_BASE
: AMDGPU::SRC_PRIVATE_BASE;
+ assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
+ !Subtarget->hasGloballyAddressableScratch()) &&
+ "Cannot use src_private_base with globally addressable scratch!");
// Note: this feature (register) is broken. When used as a 32-bit operand,
// it returns a wrong value (all zeroes?). The real value is in the upper 32
// bits.
@@ -7760,6 +7770,18 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
+ if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ Subtarget->hasGloballyAddressableScratch()) {
+ // flat -> private with globally addressable scratch: subtract
+ // src_flat_scratch_base_lo.
+ SDValue FlatScratchBaseLo(
+ DAG.getMachineNode(
+ AMDGPU::S_MOV_B32, SL, MVT::i32,
+ DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
+ 0);
+ Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
+ }
+
if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
return Ptr;
@@ -7776,11 +7798,40 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
-
- SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
- SDValue CvtPtr =
- DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
- CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+ SDValue CvtPtr;
+ if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ Subtarget->hasGloballyAddressableScratch()) {
+ // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
+ // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
+ SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
+ SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
+ ThreadID = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
+ DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
+ AllOnes, ThreadID);
+ if (Subtarget->isWave64())
+ ThreadID = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
+ DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
+ AllOnes, ThreadID);
+ SDValue ShAmt = DAG.getShiftAmountConstant(
+ 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
+ SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
+ CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
+ CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+ // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
+ // 64-bit hi:lo value.
+ SDValue FlatScratchBase = {
+ DAG.getMachineNode(
+ AMDGPU::S_MOV_B64, SL, MVT::i64,
+ DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
+ 0};
+ CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
+ } else {
+ SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
+ CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+ CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+ }
if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
return CvtPtr;
@@ -9424,15 +9475,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private: {
SDLoc SL(Op);
- unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
- ? AMDGPUAS::LOCAL_ADDRESS
- : AMDGPUAS::PRIVATE_ADDRESS;
- SDValue Aperture = getSegmentAperture(AS, SL, DAG);
SDValue SrcVec =
DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
-
SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
DAG.getConstant(1, SL, MVT::i32));
+
+ unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
+ ? AMDGPUAS::LOCAL_ADDRESS
+ : AMDGPUAS::PRIVATE_ADDRESS;
+ if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
+ Subtarget->hasGloballyAddressableScratch()) {
+ SDValue FlatScratchBaseHi(
+ DAG.getMachineNode(
+ AMDGPU::S_MOV_B32, DL, MVT::i32,
+ DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
+ 0);
+ // Test bits 63..58 against the aperture address.
+ return DAG.getSetCC(
+ SL, MVT::i1,
+ DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
+ DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
+ }
+
+ SDValue Aperture = getSegmentAperture(AS, SL, DAG);
return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
}
case Intrinsic::amdgcn_perm:
@@ -17630,6 +17695,8 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
RMW->hasMetadata("amdgpu.no.remote.memory"))
return true;
+ if (Subtarget.hasEmulatedSystemScopeAtomics())
+ return true;
} else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
return true;
@@ -17877,8 +17944,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::UMax: {
if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
- // Always expand system scope min/max atomics.
- if (HasSystemScope)
+ if (HasSystemScope && !Subtarget->hasEmulatedSystemScopeAtomics())
return AtomicExpansionKind::CmpXChg;
}
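A small sketch of the new amdgcn.is.private test under globally addressable scratch (helper name and signature are illustrative only, mirroring the XOR-and-compare emitted above):

    #include <cstdint>
    // (hi32 ^ base_hi) < 2^26 holds exactly when bits 63..58 of the pointer
    // match the top six bits of SRC_FLAT_SCRATCH_BASE.
    bool isPrivatePtr(uint32_t PtrHi32, uint32_t FlatScratchBaseHi) {
      return (PtrHi32 ^ FlatScratchBaseHi) < (1u << 26);
    }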
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 3f61bbd..f20b22d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6122,10 +6122,11 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
!Op.isIdenticalTo(*MO))
return false;
- // Do not fold a frame index into an instruction that already has a frame
- // index. The frame index handling code doesn't handle fixing up operand
- // constraints if there are multiple indexes.
- if (Op.isFI() && MO->isFI())
+ // Do not fold a non-inlineable and non-register operand into an
+ // instruction that already has a frame index. The frame index handling
+ // code cannot cope with a frame index co-existing with another
+ // non-register operand, unless that operand is an inlineable immediate.
+ if (Op.isFI())
return false;
}
} else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
@@ -10073,7 +10074,30 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
InstructionUniformity
SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
unsigned opcode = MI.getOpcode();
+
+ auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
+ : MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Src);
+ unsigned DstAS = DstTy.getAddressSpace();
+ unsigned SrcAS = SrcTy.getAddressSpace();
+ return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ DstAS == AMDGPUAS::FLAT_ADDRESS &&
+ ST.hasGloballyAddressableScratch()
+ ? InstructionUniformity::NeverUniform
+ : InstructionUniformity::Default;
+ };
+
+ // If the target supports globally addressable scratch, the mapping from
+ // scratch memory to the flat aperture changes, so an address space cast
+ // is no longer uniform.
+ if (opcode == TargetOpcode::G_ADDRSPACE_CAST)
+ return HandleAddrSpaceCast(MI);
+
if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
auto IID = GI->getIntrinsicID();
if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
@@ -10082,6 +10106,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::AlwaysUniform;
switch (IID) {
+ case Intrinsic::amdgcn_addrspacecast_nonnull:
+ return HandleAddrSpaceCast(MI);
case Intrinsic::amdgcn_if:
case Intrinsic::amdgcn_else:
// FIXME: Uniform if second result
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index ed6b973..81655f5 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -866,7 +866,8 @@ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16, v4bf16],
def SReg_64_XEXEC_XNULL : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
(add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SRC_SHARED_BASE,
- SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA)> {
+ SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA,
+ SRC_FLAT_SCRATCH_BASE)> {
let CopyCost = 1;
let AllocationPriority = 1;
let HasSGPR = 1;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 8303410..431d73b 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1653,6 +1653,12 @@ def S_SETPRIO_INC_WG : SOPP_Pseudo <"s_setprio_inc_wg", (ins i16imm:$simm16), "$
let SubtargetPredicate = HasSetPrioIncWgInst;
}
+def S_GET_SHADER_CYCLES_U64 : SOP1_64_0 <"s_get_shader_cycles_u64",
+ [(set i64:$sdst, (readcyclecounter))]> {
+ let SubtargetPredicate = HasSGetShaderCyclesInst;
+ let hasSideEffects = 1;
+}
+
let Uses = [EXEC, M0] in {
def S_SENDMSG : SOPP_Pseudo <"s_sendmsg" , (ins SendMsg:$simm16), "$simm16",
[(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]> {
@@ -2145,6 +2151,7 @@ defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>;
defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>;
// GFX1250
+defm S_GET_SHADER_CYCLES_U64 : SOP1_Real_gfx12<0x06>;
defm S_ADD_PC_I64 : SOP1_Real_gfx12<0x04b>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index e433b85..3d9455f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -223,6 +223,10 @@ static constexpr CustomOperand Operands[] = {
{{"HW_REG_SQ_PERF_SNAPSHOT_PC_LO"}, ID_SQ_PERF_SNAPSHOT_PC_LO, isGFX940},
{{"HW_REG_SQ_PERF_SNAPSHOT_PC_HI"}, ID_SQ_PERF_SNAPSHOT_PC_HI, isGFX940},
+ // GFX1250
+ {{"HW_REG_XNACK_STATE_PRIV"}, ID_XNACK_STATE_PRIV, isGFX1250},
+ {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK_gfx1250, isGFX1250},
+
// Aliases
{{"HW_REG_HW_ID"}, ID_HW_ID1, isGFX10},
};
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 7f8b446..ea99cc4 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -737,7 +737,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
const RTLIB::LibcallImpl Impl;
} LibraryCalls[] = {
{RTLIB::FPROUND_F32_F16, RTLIB::__aeabi_f2h},
- {RTLIB::FPROUND_F64_F16, RTLIB::__aeabi_d2h},
{RTLIB::FPEXT_F16_F32, RTLIB::__aeabi_h2f},
};
@@ -20351,7 +20350,8 @@ static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
if (PR == 0 || VT == MVT::Other)
return false;
return (ARM::SPRRegClass.contains(PR) && VT != MVT::f32 && VT != MVT::i32) ||
- (ARM::DPRRegClass.contains(PR) && VT != MVT::f64);
+ (ARM::DPRRegClass.contains(PR) && VT != MVT::f64 &&
+ !VT.is64BitVector());
}
using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
@@ -20784,9 +20784,8 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
if (Align)
- SP = DAG.getNode(
- ISD::AND, DL, MVT::i32, SP.getValue(0),
- DAG.getSignedConstant(-(uint64_t)Align->value(), DL, MVT::i32));
+ SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
+ DAG.getSignedConstant(-Align->value(), DL, MVT::i32));
Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
SDValue Ops[2] = { SP, Chain };
return DAG.getMergeValues(Ops, DL);
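[Editor's note] For reference, the masking step in LowerDYNAMIC_STACKALLOC above rounds the decremented stack pointer down to the requested power-of-two alignment. A one-function sketch on plain integers (illustrative only):

  #include <cstdint>

  // SP' = (SP - Size) & -Align, for a power-of-two Align.
  uint32_t dynAllocAligned(uint32_t SP, uint32_t Size, uint32_t Align) {
    SP -= Size;
    SP &= ~(Align - 1); // same as ANDing with the signed constant -Align
    return SP;
  }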
@@ -21359,7 +21358,9 @@ bool ARMTargetLowering::useLoadStackGuardNode(const Module &M) const {
}
void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
- if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
+ RTLIB::LibcallImpl SecurityCheckCookieLibcall =
+ getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
+ if (SecurityCheckCookieLibcall == RTLIB::Unsupported)
return TargetLowering::insertSSPDeclarations(M);
// MSVC CRT has a global variable holding security cookie.
@@ -21368,23 +21369,32 @@ void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT has a function to validate security cookie.
FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
- "__security_check_cookie", Type::getVoidTy(M.getContext()),
- PointerType::getUnqual(M.getContext()));
+ getLibcallImplName(SecurityCheckCookieLibcall),
+ Type::getVoidTy(M.getContext()), PointerType::getUnqual(M.getContext()));
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
F->addParamAttr(0, Attribute::AttrKind::InReg);
}
Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
- // MSVC CRT has a global variable holding security cookie.
- if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
+ RTLIB::LibcallImpl SecurityCheckCookieLibcall =
+ getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
+ if (SecurityCheckCookieLibcall != RTLIB::Unsupported) {
+ // MSVC CRT has a global variable holding security cookie.
+ //
+ // FIXME: We have a libcall entry for the corresponding check function, but
+ // not for the global variable's name.
return M.getGlobalVariable("__security_cookie");
+ }
+
return TargetLowering::getSDagStackGuard(M);
}
Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
- if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
- return M.getFunction("__security_check_cookie");
+ RTLIB::LibcallImpl SecurityCheckCookie =
+ getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
+ if (SecurityCheckCookie != RTLIB::Unsupported)
+ return M.getFunction(getLibcallImplName(SecurityCheckCookie));
return TargetLowering::getSSPStackGuardCheck(M);
}
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 3955f2a..25ad9ec 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -669,7 +669,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
default: {
// Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows
// us to fold the constant into the cmp instruction.
- RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
+ RHS = DAG.getSignedConstant(C->getSExtValue() + 1, DL, VT);
CC = ISD::SETGE;
break;
}
@@ -713,7 +713,10 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows us to
// fold the constant into the cmp instruction.
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
- RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
+ // A comparison like "icmp ugt i16 65535, %0" should already have been
+ // converted to something else. Assert to make sure this assumption holds.
+ assert((!C->isAllOnes()) && "integer overflow in comparison transform");
+ RHS = DAG.getConstant(C->getZExtValue() + 1, DL, VT);
CC = ISD::SETUGE;
break;
}
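[Editor's note] The AVR change above relies on the identity behind the transform: for unsigned values, "C < x" is equivalent to "x >= C + 1" only when C is not all-ones, since C + 1 would otherwise wrap to zero; that is what the new assert guards. A tiny standalone illustration (hypothetical helper, plain C++):

  #include <cassert>
  #include <cstdint>

  bool ultAsUge(uint16_t C, uint16_t X) {
    // "C < X" rewritten as "X >= C + 1" so the constant folds into cmp.
    assert(C != 0xFFFFu && "all-ones constant must be handled elsewhere");
    return X >= static_cast<uint16_t>(C + 1);
  }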
diff --git a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp
index 73abfe7..306db6a 100644
--- a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp
+++ b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp
@@ -87,17 +87,50 @@ static bool forwardHandleAccesses(Function &F, DominatorTree &DT) {
for (LoadInst *LI : LoadsToProcess) {
Value *V = LI->getPointerOperand();
- auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
+ auto *GV = dyn_cast<GlobalVariable>(V);
// If we didn't find the global, we may need to walk through a level of
// indirection. This generally happens at -O0.
- if (!GV)
+ if (!GV) {
if (auto *NestedLI = dyn_cast<LoadInst>(V)) {
BasicBlock::iterator BBI(NestedLI);
Value *Loaded = FindAvailableLoadedValue(
NestedLI, NestedLI->getParent(), BBI, 0, nullptr, nullptr);
GV = dyn_cast_or_null<GlobalVariable>(Loaded);
+ } else if (auto *NestedAlloca = dyn_cast<AllocaInst>(V)) {
+ for (auto &Use : NestedAlloca->uses()) {
+ auto *Store = dyn_cast<StoreInst>(Use.getUser());
+ if (!Store)
+ continue;
+
+ Value *StoredVal = Store->getValueOperand();
+ if (!StoredVal)
+ continue;
+
+ // Try direct global match
+ GV = dyn_cast<GlobalVariable>(StoredVal);
+ if (GV)
+ break;
+
+ // If it's a load, check its source
+ if (auto *Load = dyn_cast<LoadInst>(StoredVal)) {
+ GV = dyn_cast<GlobalVariable>(Load->getPointerOperand());
+ if (GV)
+ break;
+
+ // If loading from an unmodified stack copy of the global, reuse the
+ // global's value. Note: this repeats the handling of the nested-load
+ // case above for the alloca/store pattern.
+ BasicBlock::iterator BBI(Load);
+ Value *Loaded = FindAvailableLoadedValue(Load, Load->getParent(),
+ BBI, 0, nullptr, nullptr);
+ GV = dyn_cast<GlobalVariable>(Loaded);
+ if (GV)
+ break;
+ }
+ }
}
+ }
auto It = HandleMap.find(GV);
if (It == HandleMap.end()) {
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
index ffd900c..5153d24 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
@@ -56,6 +56,8 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable(
case Intrinsic::dx_wave_reduce_sum:
case Intrinsic::dx_wave_reduce_umax:
case Intrinsic::dx_wave_reduce_usum:
+ case Intrinsic::dx_imad:
+ case Intrinsic::dx_umad:
return true;
default:
return false;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index a5bf0e5..6583a0f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -6729,8 +6729,7 @@ static bool CC_LoongArchAssign2GRLen(unsigned GRLen, CCState &State,
static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI,
unsigned ValNo, MVT ValVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State, bool IsFixed, bool IsRet,
- Type *OrigTy) {
+ CCState &State, bool IsRet, Type *OrigTy) {
unsigned GRLen = DL.getLargestLegalIntTypeSizeInBits();
assert((GRLen == 32 || GRLen == 64) && "Unspport GRLen");
MVT GRLenVT = GRLen == 32 ? MVT::i32 : MVT::i64;
@@ -6752,7 +6751,7 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI,
case LoongArchABI::ABI_LP64F:
case LoongArchABI::ABI_ILP32D:
case LoongArchABI::ABI_LP64D:
- UseGPRForFloat = !IsFixed;
+ UseGPRForFloat = ArgFlags.isVarArg();
break;
case LoongArchABI::ABI_ILP32S:
case LoongArchABI::ABI_LP64S:
@@ -6766,7 +6765,8 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI,
// will not be passed by registers if the original type is larger than
// 2*GRLen, so the register alignment rule does not apply.
unsigned TwoGRLenInBytes = (2 * GRLen) / 8;
- if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoGRLenInBytes &&
+ if (ArgFlags.isVarArg() &&
+ ArgFlags.getNonZeroOrigAlign() == TwoGRLenInBytes &&
DL.getTypeAllocSize(OrigTy) == TwoGRLenInBytes) {
unsigned RegIdx = State.getFirstUnallocated(ArgGPRs);
// Skip 'odd' register if necessary.
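[Editor's note] The register-alignment rule touched above: a variadic argument that is 2*GRLen bytes in size and 2*GRLen aligned must start in an even/odd register pair, so an odd first-free register index is skipped. A trivial sketch of that index adjustment (hypothetical helper):

  // Returns the register index such an argument should start at.
  unsigned alignToEvenReg(unsigned FirstUnallocated) {
    return (FirstUnallocated % 2 != 0) ? FirstUnallocated + 1
                                       : FirstUnallocated;
  }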
@@ -6916,7 +6916,7 @@ void LoongArchTargetLowering::analyzeInputArgs(
LoongArchABI::ABI ABI =
MF.getSubtarget<LoongArchSubtarget>().getTargetABI();
if (Fn(MF.getDataLayout(), ABI, i, ArgVT, CCValAssign::Full, Ins[i].Flags,
- CCInfo, /*IsFixed=*/true, IsRet, ArgTy)) {
+ CCInfo, IsRet, ArgTy)) {
LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type " << ArgVT
<< '\n');
llvm_unreachable("");
@@ -6934,7 +6934,7 @@ void LoongArchTargetLowering::analyzeOutputArgs(
LoongArchABI::ABI ABI =
MF.getSubtarget<LoongArchSubtarget>().getTargetABI();
if (Fn(MF.getDataLayout(), ABI, i, ArgVT, CCValAssign::Full, Outs[i].Flags,
- CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) {
+ CCInfo, IsRet, OrigTy)) {
LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type " << ArgVT
<< "\n");
llvm_unreachable("");
@@ -7647,8 +7647,7 @@ bool LoongArchTargetLowering::CanLowerReturn(
LoongArchABI::ABI ABI =
MF.getSubtarget<LoongArchSubtarget>().getTargetABI();
if (CC_LoongArch(MF.getDataLayout(), ABI, i, Outs[i].VT, CCValAssign::Full,
- Outs[i].Flags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true,
- nullptr))
+ Outs[i].Flags, CCInfo, /*IsRet=*/true, nullptr))
return false;
}
return true;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 6b49a98f..f79ba74 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -330,7 +330,7 @@ private:
unsigned ValNo, MVT ValVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State,
- bool IsFixed, bool IsRet, Type *OrigTy);
+ bool IsRet, Type *OrigTy);
void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet,
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 5096a8f..d8bb16f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1651,20 +1651,19 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
(XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
(XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;
-def : Pat<(vector_insert v8f32:$xd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm),
- (XVINSGR2VR_W $xd, $rj, uimm3:$imm)>;
-def : Pat<(vector_insert v4f64:$xd, (f64 (bitconvert i64:$rj)), uimm2:$imm),
- (XVINSGR2VR_D $xd, $rj, uimm2:$imm)>;
-def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2),
- (XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>;
-def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2),
- (XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>;
+def : Pat<(vector_insert v8f32:$xd, (loongarch_movgr2fr_w_la64 GPR:$rj),
+ uimm3:$imm),
+ (XVINSGR2VR_W v8f32:$xd, GPR:$rj, uimm3:$imm)>;
+def : Pat<(vector_insert v4f64:$xd, (f64(bitconvert i64:$rj)), uimm2:$imm),
+ (XVINSGR2VR_D v4f64:$xd, GPR:$rj, uimm2:$imm)>;
// XVINSVE0_{W/D}
def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm),
- (XVINSVE0_W $xd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), uimm3:$imm)>;
+ (XVINSVE0_W v8f32:$xd, (SUBREG_TO_REG(i64 0), FPR32:$fj, sub_32),
+ uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm),
- (XVINSVE0_D $xd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), uimm2:$imm)>;
+ (XVINSVE0_D v4f64:$xd, (SUBREG_TO_REG(i64 0), FPR64:$fj, sub_64),
+ uimm2:$imm)>;
// scalar_to_vector
def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)),
@@ -1884,10 +1883,10 @@ def : Pat<(i64 (vector_extract v8i32:$xj, uimm3:$imm)),
(XVPICKVE2GR_W v8i32:$xj, uimm3:$imm)>;
def : Pat<(i64 (vector_extract v4i64:$xj, uimm2:$imm)),
(XVPICKVE2GR_D v4i64:$xj, uimm2:$imm)>;
-def : Pat<(f32 (vector_extract v8f32:$xj, uimm3:$imm)),
- (MOVGR2FR_W (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm))>;
-def : Pat<(f64 (vector_extract v4f64:$xj, uimm2:$imm)),
- (MOVGR2FR_D (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm))>;
+def : Pat<(f32(vector_extract v8f32:$xj, uimm3:$imm)),
+ (EXTRACT_SUBREG(XVPICKVE_W v8f32:$xj, uimm3:$imm), sub_32)>;
+def : Pat<(f64(vector_extract v4f64:$xj, uimm2:$imm)),
+ (EXTRACT_SUBREG(XVPICKVE_D v4f64:$xj, uimm2:$imm), sub_64)>;
// vselect
def : Pat<(v32i8 (vselect LASX256:$xd, (v32i8 (SplatPat_uimm8 uimm8:$imm)),
diff --git a/llvm/lib/Target/Mips/MipsCCState.cpp b/llvm/lib/Target/Mips/MipsCCState.cpp
index 9e8cd2e..13237c5 100644
--- a/llvm/lib/Target/Mips/MipsCCState.cpp
+++ b/llvm/lib/Target/Mips/MipsCCState.cpp
@@ -128,12 +128,10 @@ void MipsCCState::PreAnalyzeReturnValue(EVT ArgVT) {
OriginalRetWasFloatVector.push_back(originalEVTTypeIsVectorFloat(ArgVT));
}
-void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, bool IsFixed,
- const char *Func) {
+void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, const char *Func) {
OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, Func));
OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy());
OriginalArgWasFloatVector.push_back(ArgTy->isVectorTy());
- CallOperandIsFixed.push_back(IsFixed);
}
/// Identify lowered values that originated from f128, float and sret to vXfXX
@@ -148,7 +146,6 @@ void MipsCCState::PreAnalyzeCallOperands(
OriginalArgWasF128.push_back(originalTypeIsF128(FuncArg.Ty, Func));
OriginalArgWasFloat.push_back(FuncArg.Ty->isFloatingPointTy());
OriginalArgWasFloatVector.push_back(FuncArg.Ty->isVectorTy());
- CallOperandIsFixed.push_back(Outs[i].IsFixed);
}
}
diff --git a/llvm/lib/Target/Mips/MipsCCState.h b/llvm/lib/Target/Mips/MipsCCState.h
index 4229da5..30b68e8 100644
--- a/llvm/lib/Target/Mips/MipsCCState.h
+++ b/llvm/lib/Target/Mips/MipsCCState.h
@@ -36,7 +36,7 @@ public:
static bool originalEVTTypeIsVectorFloat(EVT Ty);
static bool originalTypeIsVectorFloat(const Type *Ty);
- void PreAnalyzeCallOperand(const Type *ArgTy, bool IsFixed, const char *Func);
+ void PreAnalyzeCallOperand(const Type *ArgTy, const char *Func);
void PreAnalyzeFormalArgument(const Type *ArgTy, ISD::ArgFlagsTy Flags);
void PreAnalyzeReturnValue(EVT ArgVT);
@@ -86,10 +86,6 @@ private:
/// vector.
SmallVector<bool, 4> OriginalRetWasFloatVector;
- /// Records whether the value was a fixed argument.
- /// See ISD::OutputArg::IsFixed,
- SmallVector<bool, 4> CallOperandIsFixed;
-
// Used to handle MIPS16-specific calling convention tweaks.
// FIXME: This should probably be a fully fledged calling convention.
SpecialCallingConvType SpecialCallingConv;
@@ -106,7 +102,6 @@ public:
OriginalArgWasF128.clear();
OriginalArgWasFloat.clear();
OriginalArgWasFloatVector.clear();
- CallOperandIsFixed.clear();
PreAnalyzeCallOperands(Outs, FuncArgs, Func);
}
@@ -213,7 +208,6 @@ public:
bool WasOriginalRetVectorFloat(unsigned ValNo) const {
return OriginalRetWasFloatVector[ValNo];
}
- bool IsCallOperandFixed(unsigned ValNo) { return CallOperandIsFixed[ValNo]; }
SpecialCallingConvType getSpecialCallingConv() { return SpecialCallingConv; }
};
}
diff --git a/llvm/lib/Target/Mips/MipsCallLowering.cpp b/llvm/lib/Target/Mips/MipsCallLowering.cpp
index 555773a..fa49108 100644
--- a/llvm/lib/Target/Mips/MipsCallLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsCallLowering.cpp
@@ -47,7 +47,7 @@ struct MipsOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner {
if (IsReturn)
State.PreAnalyzeReturnValue(EVT::getEVT(Info.Ty));
else
- State.PreAnalyzeCallOperand(Info.Ty, Info.IsFixed, Func);
+ State.PreAnalyzeCallOperand(Info.Ty, Func);
return CallLowering::OutgoingValueAssigner::assignArg(
ValNo, OrigVT, ValVT, LocVT, LocInfo, Info, Flags, State);
diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td
index 39e184a..0e5c16c 100644
--- a/llvm/lib/Target/Mips/MipsCallingConv.td
+++ b/llvm/lib/Target/Mips/MipsCallingConv.td
@@ -29,12 +29,6 @@ class CCIfOrigArgWasFloat<CCAction A>
class CCIfOrigArgWasF128<CCAction A>
: CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgF128(ValNo)", A>;
-/// Match if this specific argument is a vararg.
-/// This is slightly different fro CCIfIsVarArg which matches if any argument is
-/// a vararg.
-class CCIfArgIsVarArg<CCAction A>
- : CCIf<"!static_cast<MipsCCState *>(&State)->IsCallOperandFixed(ValNo)", A>;
-
/// Match if the return was a floating point vector.
class CCIfOrigArgWasNotVectorFloat<CCAction A>
: CCIf<"!static_cast<MipsCCState *>(&State)"
@@ -344,7 +338,7 @@ def CC_Mips_VarArg : CallingConv<[
]>;
def CC_Mips : CallingConv<[
- CCIfVarArg<CCIfArgIsVarArg<CCDelegateTo<CC_Mips_VarArg>>>,
+ CCIfVarArg<CCIfArgVarArg<CCDelegateTo<CC_Mips_VarArg>>>,
CCDelegateTo<CC_Mips_FixedArg>
]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 15f45a1..d4f0cc9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -900,6 +900,17 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
if (STI.allowFP16Math() || STI.hasBF16Math())
setTargetDAGCombine(ISD::SETCC);
+ // Vector reduction operations. These may be turned into shuffle or tree
+ // reductions depending on what instructions are available for each type.
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
+ MVT EltVT = VT.getVectorElementType();
+ if (EltVT == MVT::f32 || EltVT == MVT::f64) {
+ setOperationAction({ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
+ ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
+ VT, Custom);
+ }
+ }
+
// Promote fp16 arithmetic if fp16 hardware isn't available or the
// user passed --nvptx-no-fp16-math. The flag is useful because,
// although sm_53+ GPUs have some sort of FP16 support in
@@ -1143,6 +1154,10 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::BFI)
MAKE_CASE(NVPTXISD::PRMT)
MAKE_CASE(NVPTXISD::FCOPYSIGN)
+ MAKE_CASE(NVPTXISD::FMAXNUM3)
+ MAKE_CASE(NVPTXISD::FMINNUM3)
+ MAKE_CASE(NVPTXISD::FMAXIMUM3)
+ MAKE_CASE(NVPTXISD::FMINIMUM3)
MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
MAKE_CASE(NVPTXISD::STACKRESTORE)
MAKE_CASE(NVPTXISD::STACKSAVE)
@@ -1929,6 +1944,124 @@ static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL,
return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
}
+/// Reduces the elements using the scalar operations provided. The operations
+/// are sorted in descending order by the number of inputs they take. The flags
+/// on the original reduction operation are propagated to each scalar
+/// operation. Nearby elements are grouped into a tree reduction, unlike the
+/// shuffle reduction used in ExpandReductions and SelectionDAG.
+static SDValue buildTreeReduction(
+ const SmallVector<SDValue> &Elements, EVT EltTy,
+ ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
+ const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
+ // Build the reduction tree at each level, starting with all the elements.
+ SmallVector<SDValue> Level = Elements;
+
+ unsigned OpIdx = 0;
+ while (Level.size() > 1) {
+ // Try to reduce this level using the current operator.
+ const auto [Op, NumInputs] = Ops[OpIdx];
+
+ // Build the next level by partially reducing all elements.
+ SmallVector<SDValue> ReducedLevel;
+ unsigned I = 0, E = Level.size();
+ for (; I + NumInputs <= E; I += NumInputs) {
+ // Reduce elements in groups of [NumInputs], as much as possible.
+ ReducedLevel.push_back(DAG.getNode(
+ Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags));
+ }
+
+ if (I < E) {
+ // Handle leftover elements.
+
+ if (ReducedLevel.empty()) {
+ // We didn't reduce anything at this level. We need to pick a smaller
+ // operator.
+ ++OpIdx;
+ assert(OpIdx < Ops.size() && "no smaller operators for reduction");
+ continue;
+ }
+
+ // We reduced some elements, but some are left over because the operator's
+ // number of inputs doesn't evenly divide the size of this level. Move the
+ // leftover elements to the next level.
+ for (; I < E; ++I)
+ ReducedLevel.push_back(Level[I]);
+ }
+
+ // Process the next level.
+ Level = ReducedLevel;
+ }
+
+ return *Level.begin();
+}
+
+// Get scalar reduction opcode
+static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) {
+ switch (ReductionOpcode) {
+ case ISD::VECREDUCE_FMAX:
+ return ISD::FMAXNUM;
+ case ISD::VECREDUCE_FMIN:
+ return ISD::FMINNUM;
+ case ISD::VECREDUCE_FMAXIMUM:
+ return ISD::FMAXIMUM;
+ case ISD::VECREDUCE_FMINIMUM:
+ return ISD::FMINIMUM;
+ default:
+ llvm_unreachable("unhandled reduction opcode");
+ }
+}
+
+/// Get 3-input scalar reduction opcode
+static std::optional<NVPTXISD::NodeType>
+getScalar3OpcodeForReduction(unsigned ReductionOpcode) {
+ switch (ReductionOpcode) {
+ case ISD::VECREDUCE_FMAX:
+ return NVPTXISD::FMAXNUM3;
+ case ISD::VECREDUCE_FMIN:
+ return NVPTXISD::FMINNUM3;
+ case ISD::VECREDUCE_FMAXIMUM:
+ return NVPTXISD::FMAXIMUM3;
+ case ISD::VECREDUCE_FMINIMUM:
+ return NVPTXISD::FMINIMUM3;
+ default:
+ return std::nullopt;
+ }
+}
+
+/// Lower reductions to either a sequence of operations or, if reassociation
+/// is allowed, a tree. This method uses larger operations such as max3/min3
+/// when the target supports them.
+SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ const SDNodeFlags Flags = Op->getFlags();
+ SDValue Vector = Op.getOperand(0);
+
+ const unsigned Opcode = Op->getOpcode();
+ const EVT EltTy = Vector.getValueType().getVectorElementType();
+
+ // Whether we can use 3-input min/max when expanding the reduction.
+ const bool CanUseMinMax3 =
+ EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
+ STI.getPTXVersion() >= 88 &&
+ (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN ||
+ Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM);
+
+ // A list of SDNode opcodes with equivalent semantics, sorted in descending
+ // order by the number of inputs they take.
+ SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
+
+ if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode);
+ CanUseMinMax3 && Opcode3Elem)
+ ScalarOps.push_back({*Opcode3Elem, 3});
+ ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2});
+
+ SmallVector<SDValue> Elements;
+ DAG.ExtractVectorElements(Vector, Elements);
+
+ return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
+}
+
SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
// Handle bitcasting from v2i8 without hitting the default promotion
// strategy which goes through stack memory.
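[Editor's note] To make the reduction strategy above concrete, here is a standalone sketch of how buildTreeReduction walks the levels, using plain floats and std::max in place of SDNodes (the helper name and the fixed 3-then-2 arity order are illustrative assumptions mirroring ScalarOps):

  #include <algorithm>
  #include <cassert>
  #include <utility>
  #include <vector>

  float treeReduceMax(std::vector<float> Level) {
    assert(!Level.empty() && "cannot reduce an empty vector");
    unsigned Arity = 3; // try the widest operator first (e.g. FMAXNUM3)
    while (Level.size() > 1) {
      std::vector<float> Reduced;
      size_t I = 0, E = Level.size();
      // Reduce elements in groups of Arity, as much as possible.
      for (; I + Arity <= E; I += Arity)
        Reduced.push_back(Arity == 3
                              ? std::max({Level[I], Level[I + 1], Level[I + 2]})
                              : std::max(Level[I], Level[I + 1]));
      if (I < E) {
        if (Reduced.empty()) { // level too small for this arity:
          Arity = 2;           // fall back to the 2-input operator
          continue;
        }
        // Carry leftover elements up to the next level.
        Reduced.insert(Reduced.end(), Level.begin() + I, Level.end());
      }
      Level = std::move(Reduced);
    }
    return Level.front();
  }

For instance, eight elements reduce with three 3-input operations plus one final 2-input operation (four scalar operations) instead of seven pairwise ones.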
@@ -2808,6 +2941,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::CONCAT_VECTORS:
return LowerCONCAT_VECTORS(Op, DAG);
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
+ return LowerVECREDUCE(Op, DAG);
case ISD::STORE:
return LowerSTORE(Op, DAG);
case ISD::LOAD:
@@ -3908,6 +4046,18 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}
+ case Intrinsic::nvvm_prefetch_tensormap: {
+ auto &DL = I.getDataLayout();
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = getPointerTy(DL);
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags =
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
+ Info.align.reset();
+ return true;
+ }
+
case Intrinsic::nvvm_ldu_global_i:
case Intrinsic::nvvm_ldu_global_f:
case Intrinsic::nvvm_ldu_global_p: {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index cf72a1e..43e721a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -64,6 +64,11 @@ enum NodeType : unsigned {
UNPACK_VECTOR,
FCOPYSIGN,
+ FMAXNUM3,
+ FMINNUM3,
+ FMAXIMUM3,
+ FMINIMUM3,
+
DYNAMIC_STACKALLOC,
STACKRESTORE,
STACKSAVE,
@@ -286,6 +291,7 @@ private:
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 6765ecb..1ab41bf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -347,6 +347,36 @@ multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
}
+// Template for 3-input minimum/maximum instructions
+// (sm_100+/PTX 8.8 and f32 only)
+//
+// Also defines ftz (flush subnormal inputs and results to sign-preserving
+// zero) variants for fp32 functions.
+multiclass FMINIMUMMAXIMUM3<string OpcStr, bit NaN, SDNode OpNode> {
+ defvar nan_str = !if(NaN, ".NaN", "");
+ def f32rrr :
+ BasicFlagsNVPTXInst<(outs B32:$dst),
+ (ins B32:$a, B32:$b, B32:$c),
+ (ins FTZFlag:$ftz),
+ OpcStr # "$ftz" # nan_str # ".f32",
+ [(set f32:$dst, (OpNode f32:$a, f32:$b, f32:$c))]>,
+ Requires<[hasPTX<88>, hasSM<100>]>;
+ def f32rri :
+ BasicFlagsNVPTXInst<(outs B32:$dst),
+ (ins B32:$a, B32:$b, f32imm:$c),
+ (ins FTZFlag:$ftz),
+ OpcStr # "$ftz" # nan_str # ".f32",
+ [(set f32:$dst, (OpNode f32:$a, f32:$b, fpimm:$c))]>,
+ Requires<[hasPTX<88>, hasSM<100>]>;
+ def f32rii :
+ BasicFlagsNVPTXInst<(outs B32:$dst),
+ (ins B32:$a, f32imm:$b, f32imm:$c),
+ (ins FTZFlag:$ftz),
+ OpcStr # "$ftz" # nan_str # ".f32",
+ [(set f32:$dst, (OpNode f32:$a, fpimm:$b, fpimm:$c))]>,
+ Requires<[hasPTX<88>, hasSM<100>]>;
+}
+
// Template for instructions which take three FP args. The
// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
//
@@ -900,6 +930,20 @@ defm MAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>;
defm MIN_NAN : FMINIMUMMAXIMUM<"min", /* NaN */ true, fminimum>;
defm MAX_NAN : FMINIMUMMAXIMUM<"max", /* NaN */ true, fmaximum>;
+def nvptx_fminnum3 : SDNode<"NVPTXISD::FMINNUM3", SDTFPTernaryOp,
+ [SDNPCommutative]>;
+def nvptx_fmaxnum3 : SDNode<"NVPTXISD::FMAXNUM3", SDTFPTernaryOp,
+ [SDNPCommutative]>;
+def nvptx_fminimum3 : SDNode<"NVPTXISD::FMINIMUM3", SDTFPTernaryOp,
+ [SDNPCommutative]>;
+def nvptx_fmaximum3 : SDNode<"NVPTXISD::FMAXIMUM3", SDTFPTernaryOp,
+ [SDNPCommutative]>;
+
+defm FMIN3 : FMINIMUMMAXIMUM3<"min", /* NaN */ false, nvptx_fminnum3>;
+defm FMAX3 : FMINIMUMMAXIMUM3<"max", /* NaN */ false, nvptx_fmaxnum3>;
+defm FMINNAN3 : FMINIMUMMAXIMUM3<"min", /* NaN */ true, nvptx_fminimum3>;
+defm FMAXNAN3 : FMINIMUMMAXIMUM3<"max", /* NaN */ true, nvptx_fmaximum3>;
+
defm FABS : F2<"abs", fabs>;
defm FNEG : F2<"neg", fneg>;
defm FABS_H: F2_Support_Half<"abs", fabs>;
@@ -1560,18 +1604,6 @@ def : Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel
(PRMT_B32rii i32:$b, 0, (to_sign_extend_selector $sel_b), PrmtNONE),
(cond2cc $cc))>;
-// A 16-bit comparison of truncated byte extracts can be be converted to 32-bit
-// comparison because we know that the truncate is just trancating off zeros
-// and that the most-significant byte is also zeros so the meaning of signed and
-// unsigned comparisons will not be changed.
-def : Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
- (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
- cond:$cc),
- (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
- (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
- (cond2cc $cc))>;
-
-
def SDTDeclareArrayParam :
SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
def SDTDeclareScalarParam :
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index d337192..d4a0ca7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -39,6 +39,12 @@ def AS_match {
code global = [{
return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
}];
+ code const = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_CONST);
+ }];
+ code param = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_PARAM);
+ }];
}
@@ -950,33 +956,47 @@ foreach dim = 3...5 in {
defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4",
[hasTMACTAGroupSupport]>;
-//Prefetch and Prefetchu
-
-let Predicates = [hasPTX<80>, hasSM<90>] in {
- class PREFETCH_INTRS<string InstName> :
- BasicNVPTXInst<(outs), (ins ADDR:$addr),
- InstName,
- [(!cast<Intrinsic>(!strconcat("int_nvvm_",
- !subst(".", "_", InstName))) addr:$addr)]>;
+// Prefetchu and Prefetch
- def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1">;
- def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2">;
- def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1">;
- def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1">;
- def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">;
- def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">;
+defvar frag_pat = (int_nvvm_prefetch_tensormap node:$addr);
- def PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins ADDR:$addr),
- "prefetch.global.L2::evict_normal",
- [(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>;
+multiclass PREFETCH_TENSORMAP_PATFRAG<string suffix, code predicate> {
+ def !tolower(suffix) : PatFrag<!setdagop(frag_pat, ops), frag_pat, predicate>;
+}
- def PREFETCH_GLOBAL_L2_EVICT_LAST : BasicNVPTXInst<(outs), (ins ADDR:$addr),
- "prefetch.global.L2::evict_last",
- [(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>;
+defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"CONST", AS_match.const>;
+defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"GENERIC", AS_match.generic>;
+defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"PARAM", AS_match.param>;
- def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">;
+multiclass PREFETCH_TENSORMAP_INST<string addrspace_name, PatFrag pattern_frag> {
+ def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr),
+ "prefetch" # addrspace_name # ".tensormap",
+ [(pattern_frag addr:$addr)]>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
}
+defm PREFETCH_CONST_TENSORMAP : PREFETCH_TENSORMAP_INST<".const", prefetch_tensormap_const>;
+defm PREFETCH_GENERIC_TENSORMAP : PREFETCH_TENSORMAP_INST<"", prefetch_tensormap_generic>;
+defm PREFETCH_PARAM_TENSORMAP : PREFETCH_TENSORMAP_INST<".param", prefetch_tensormap_param>;
+
+class PREFETCH_INTRS<string InstName, Intrinsic Intr> :
+ BasicNVPTXInst<(outs), (ins ADDR:$addr),
+ InstName,
+ [(Intr addr:$addr)]>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+
+def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1", int_nvvm_prefetchu_L1>;
+def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1", int_nvvm_prefetch_L1>;
+def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2", int_nvvm_prefetch_L2>;
+def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1", int_nvvm_prefetch_global_L1>;
+def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1", int_nvvm_prefetch_local_L1>;
+def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2", int_nvvm_prefetch_global_L2>;
+def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2", int_nvvm_prefetch_local_L2>;
+def PREFETCH_GLOBAL_L2_EVICT_NORMAL : PREFETCH_INTRS<"prefetch.global.L2::evict_normal",
+ int_nvvm_prefetch_global_L2_evict_normal>;
+def PREFETCH_GLOBAL_L2_EVICT_LAST : PREFETCH_INTRS<"prefetch.global.L2::evict_last",
+ int_nvvm_prefetch_global_L2_evict_last>;
+
//Applypriority intrinsics
class APPLYPRIORITY_L2_INTRS<string addrspace> :
BasicNVPTXInst<(outs), (ins ADDR:$addr, B64:$size),
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 3ae2d9d..f4f8961 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -564,7 +564,8 @@ bool NVPTXTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
case Intrinsic::nvvm_isspacep_global:
case Intrinsic::nvvm_isspacep_local:
case Intrinsic::nvvm_isspacep_shared:
- case Intrinsic::nvvm_isspacep_shared_cluster: {
+ case Intrinsic::nvvm_isspacep_shared_cluster:
+ case Intrinsic::nvvm_prefetch_tensormap: {
OpIndexes.push_back(0);
return true;
}
@@ -587,6 +588,11 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
return ConstantInt::get(II->getType(), *R);
return nullptr;
}
+ case Intrinsic::nvvm_prefetch_tensormap: {
+ IRBuilder<> Builder(II);
+ return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap,
+ NewV);
+ }
}
return nullptr;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 9a6e261..b32d931b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -87,6 +87,13 @@ public:
}
unsigned getMinVectorRegisterBitWidth() const override { return 32; }
+ bool shouldExpandReduction(const IntrinsicInst *II) const override {
+ // Turn off the ExpandReductions pass for NVPTX, which doesn't have advanced
+ // swizzling operations. Our backend/SelectionDAG can expand these
+ // reductions with fewer movs.
+ return false;
+ }
+
// We don't want to prevent inlining because of target-cpu and -features
// attributes that were added to newer versions of LLVM/Clang: There are
// no incompatible functions in PTX, ptxas will throw errors in such cases.
diff --git a/llvm/lib/Target/PowerPC/PPCCCState.h b/llvm/lib/Target/PowerPC/PPCCCState.h
index b0e50b2..feab9c5 100644
--- a/llvm/lib/Target/PowerPC/PPCCCState.h
+++ b/llvm/lib/Target/PowerPC/PPCCCState.h
@@ -38,36 +38,6 @@ public:
void clearWasPPCF128() { OriginalArgWasPPCF128.clear(); }
};
-class AIXCCState : public CCState {
-private:
- BitVector IsFixed;
-
-public:
- AIXCCState(CallingConv::ID CC, bool IsVarArg, MachineFunction &MF,
- SmallVectorImpl<CCValAssign> &Locs, LLVMContext &C)
- : CCState(CC, IsVarArg, MF, Locs, C) {}
-
- void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
- CCAssignFn Fn) {
- // All formal arguments are fixed.
- IsFixed.resize(Ins.size(), true);
- CCState::AnalyzeFormalArguments(Ins, Fn);
- }
-
- void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
- CCAssignFn Fn) {
- // Record whether the call operand was a fixed argument.
- IsFixed.resize(Outs.size(), false);
- for (unsigned ValNo = 0, E = Outs.size(); ValNo != E; ++ValNo)
- if (Outs[ValNo].IsFixed)
- IsFixed.set(ValNo);
-
- CCState::AnalyzeCallOperands(Outs, Fn);
- }
-
- bool isFixed(unsigned ValNo) const { return IsFixed.test(ValNo); }
-};
-
} // end namespace llvm
#endif
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 30b5fd6..2698bd6 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -3925,9 +3925,6 @@ SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
SelectionDAG &DAG) const {
- if (Subtarget.isAIXABI())
- report_fatal_error("ADJUST_TRAMPOLINE operation is not supported on AIX.");
-
return Op.getOperand(0);
}
@@ -3984,9 +3981,6 @@ SDValue PPCTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SelectionDAG &DAG) const {
- if (Subtarget.isAIXABI())
- report_fatal_error("INIT_TRAMPOLINE operation is not supported on AIX.");
-
SDValue Chain = Op.getOperand(0);
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
@@ -3994,6 +3988,65 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ if (Subtarget.isAIXABI()) {
+ // On AIX we create a trampoline descriptor by combining the
+ // entry point and TOC from the global descriptor (FPtr) with the
+ // nest argument as the environment pointer.
+ uint64_t PointerSize = Subtarget.isPPC64() ? 8 : 4;
+ MaybeAlign PointerAlign(PointerSize);
+ auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
+ ? (MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant)
+ : MachineMemOperand::MONone;
+
+ uint64_t TOCPointerOffset = 1 * PointerSize;
+ uint64_t EnvPointerOffset = 2 * PointerSize;
+ SDValue SDTOCPtrOffset = DAG.getConstant(TOCPointerOffset, dl, PtrVT);
+ SDValue SDEnvPtrOffset = DAG.getConstant(EnvPointerOffset, dl, PtrVT);
+
+ const Value *TrampolineAddr =
+ cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+ const Function *Func =
+ cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
+
+ SDValue OutChains[3];
+
+ // Copy the entry point address from the global descriptor to the
+ // trampoline buffer.
+ SDValue LoadEntryPoint =
+ DAG.getLoad(PtrVT, dl, Chain, FPtr, MachinePointerInfo(Func, 0),
+ PointerAlign, MMOFlags);
+ SDValue EPLoadChain = LoadEntryPoint.getValue(1);
+ OutChains[0] = DAG.getStore(EPLoadChain, dl, LoadEntryPoint, Trmp,
+ MachinePointerInfo(TrampolineAddr, 0));
+
+ // Copy the TOC pointer from the global descriptor to the trampoline
+ // buffer.
+ SDValue TOCFromDescriptorPtr =
+ DAG.getNode(ISD::ADD, dl, PtrVT, FPtr, SDTOCPtrOffset);
+ SDValue TOCReg = DAG.getLoad(PtrVT, dl, Chain, TOCFromDescriptorPtr,
+ MachinePointerInfo(Func, TOCPointerOffset),
+ PointerAlign, MMOFlags);
+ SDValue TrampolineTOCPointer =
+ DAG.getNode(ISD::ADD, dl, PtrVT, Trmp, SDTOCPtrOffset);
+ SDValue TOCLoadChain = TOCReg.getValue(1);
+ OutChains[1] =
+ DAG.getStore(TOCLoadChain, dl, TOCReg, TrampolineTOCPointer,
+ MachinePointerInfo(TrampolineAddr, TOCPointerOffset));
+
+ // Store the nest argument into the environment pointer in the trampoline
+ // buffer.
+ SDValue EnvPointer = DAG.getNode(ISD::ADD, dl, PtrVT, Trmp, SDEnvPtrOffset);
+ OutChains[2] =
+ DAG.getStore(Chain, dl, Nest, EnvPointer,
+ MachinePointerInfo(TrampolineAddr, EnvPointerOffset));
+
+ SDValue TokenFactor =
+ DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+ return TokenFactor;
+ }
+
bool isPPC64 = (PtrVT == MVT::i64);
Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
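[Editor's note] The AIX trampoline lowering added above writes a three-slot descriptor: the entry point and TOC pointer are copied from the nested function's descriptor, and the nest argument fills the environment slot. A plain-memory sketch of that layout for the 64-bit case (illustrative helper, not the DAG code):

  #include <cstdint>
  #include <cstring>

  void initAIXTrampoline64(void *Trmp, const void *FuncDesc, uint64_t Nest) {
    const uint64_t PointerSize = 8;
    // Entry point at offset 0 and TOC pointer at offset 8, both copied from
    // the function descriptor; the nest argument becomes the environment
    // pointer at offset 16.
    std::memcpy(Trmp, FuncDesc, PointerSize);
    std::memcpy(static_cast<char *>(Trmp) + PointerSize,
                static_cast<const char *>(FuncDesc) + PointerSize, PointerSize);
    std::memcpy(static_cast<char *>(Trmp) + 2 * PointerSize, &Nest, PointerSize);
  }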
@@ -6036,7 +6089,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
bool Result;
- if (Outs[i].IsFixed) {
+ if (!ArgFlags.isVarArg()) {
Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
CCInfo);
} else {
@@ -6852,8 +6905,7 @@ static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) {
static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &S) {
- AIXCCState &State = static_cast<AIXCCState &>(S);
+ CCState &State) {
const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
State.getMachineFunction().getSubtarget());
const bool IsPPC64 = Subtarget.isPPC64();
@@ -6865,9 +6917,6 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
if (ValVT == MVT::f128)
report_fatal_error("f128 is unimplemented on AIX.");
- if (ArgFlags.isNest())
- report_fatal_error("Nest arguments are unimplemented.");
-
static const MCPhysReg GPR_32[] = {// 32-bit registers.
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10};
@@ -6882,6 +6931,14 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
const ArrayRef<MCPhysReg> GPRs = IsPPC64 ? GPR_64 : GPR_32;
+ if (ArgFlags.isNest()) {
+ MCRegister EnvReg = State.AllocateReg(IsPPC64 ? PPC::X11 : PPC::R11);
+ if (!EnvReg)
+ report_fatal_error("More than one nest argument.");
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, EnvReg, RegVT, LocInfo));
+ return false;
+ }
+
if (ArgFlags.isByVal()) {
const Align ByValAlign(ArgFlags.getNonZeroByValAlign());
if (ByValAlign > StackAlign)
@@ -7032,7 +7089,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
// They are passed in VRs if any are available (unlike arguments passed
// through ellipses) and shadow GPRs (unlike arguments to non-vaarg
// functions)
- if (State.isFixed(ValNo)) {
+ if (!ArgFlags.isVarArg()) {
if (MCRegister VReg = State.AllocateReg(VR)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));
// Shadow allocate GPRs and stack space even though we pass in a VR.
@@ -7220,7 +7277,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- AIXCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
const EVT PtrVT = getPointerTy(MF.getDataLayout());
// Reserve space for the linkage area on the stack.
@@ -7567,8 +7624,8 @@ SDValue PPCTargetLowering::LowerCall_AIX(
MachineFunction &MF = DAG.getMachineFunction();
SmallVector<CCValAssign, 16> ArgLocs;
- AIXCCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
- *DAG.getContext());
+ CCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
+ *DAG.getContext());
// Reserve space for the linkage save area (LSA) on the stack.
// In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
@@ -9593,12 +9650,14 @@ static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,
return false;
}
-bool isValidMtVsrBmi(APInt &BitMask, BuildVectorSDNode &BVN) {
+bool isValidMtVsrBmi(APInt &BitMask, BuildVectorSDNode &BVN,
+ bool IsLittleEndian) {
assert(BVN.getNumOperands() > 0 && "Unexpected 0-size build vector");
BitMask.clearAllBits();
EVT VT = BVN.getValueType(0);
- APInt ConstValue(VT.getSizeInBits(), 0);
+ unsigned VTSize = VT.getSizeInBits();
+ APInt ConstValue(VTSize, 0);
unsigned EltWidth = VT.getScalarSizeInBits();
@@ -9608,8 +9667,10 @@ bool isValidMtVsrBmi(APInt &BitMask, BuildVectorSDNode &BVN) {
if (!CN)
return false;
-
- ConstValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth), BitPos);
+ // Elements in a vector register are laid out in opposite byte order
+ // under little-endian and big-endian modes.
+ ConstValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth),
+ IsLittleEndian ? BitPos : VTSize - EltWidth - BitPos);
BitPos += EltWidth;
}
@@ -9640,7 +9701,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// we do not convert it to MTVSRBMI.
// The xxleqv instruction sets a vector with all ones.
// The xxlxor instruction sets a vector with all zeros.
- if (isValidMtVsrBmi(BitMask, *BVN) && BitMask != 0 && BitMask != 0xffff) {
+ if (isValidMtVsrBmi(BitMask, *BVN, Subtarget.isLittleEndian()) &&
+ BitMask != 0 && BitMask != 0xffff) {
SDValue SDConstant = DAG.getTargetConstant(BitMask, dl, MVT::i32);
MachineSDNode *MSDNode =
DAG.getMachineNode(PPC::MTVSRBMI, dl, MVT::v16i8, SDConstant);
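[Editor's note] The isValidMtVsrBmi fix above mirrors each element's bit position for big-endian targets. A standalone sketch of that placement for a v16i8 build_vector (std::bitset stands in for the 128-bit APInt; illustrative only):

  #include <bitset>
  #include <cstdint>

  std::bitset<128> placeElements(const uint8_t Elts[16], bool IsLittleEndian) {
    const unsigned VTSize = 128, EltWidth = 8;
    std::bitset<128> ConstValue;
    unsigned BitPos = 0;
    for (unsigned I = 0; I < 16; ++I, BitPos += EltWidth) {
      // Little-endian inserts at BitPos; big-endian mirrors the position
      // because the two modes number vector-register bytes in reverse.
      unsigned Base = IsLittleEndian ? BitPos : VTSize - EltWidth - BitPos;
      for (unsigned B = 0; B < EltWidth; ++B)
        ConstValue[Base + B] = (Elts[I] >> B) & 1;
    }
    return ConstValue;
  }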
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index 1dc485d..98dd846 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -2175,10 +2175,7 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in {
// - Other vector types [v16i8, v8i16] require COPY_TO_REGCLASS to/from VRRC
// =============================================================================
-class XXEvalPattern<dag pattern, bits<8> imm>
- : Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {}
-
-class XXEvalPatterns<ValueType Vt, dag InputPattern, bits<8> Imm>
+class XXEvalPattern<ValueType Vt, dag InputPattern, bits<8> Imm>
: Pat<(Vt InputPattern),
!if(!or(!eq(Vt, v4i32), !eq(Vt, v2i64)),
// VSRC path: direct XXEVAL for v4i32 and v2i64
@@ -2246,26 +2243,26 @@ def VEqv
// =============================================================================
multiclass XXEvalTernarySelectAnd<ValueType Vt> {
// Pattern: A ? XOR(B,C) : AND(B,C) XXEVAL immediate value: 22
- def : XXEvalPatterns<
+ def : XXEvalPattern<
Vt, (vselect Vt:$vA, (VXor Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)),
22>;
// Pattern: A ? NOR(B,C) : AND(B,C) XXEVAL immediate value: 24
- def : XXEvalPatterns<
+ def : XXEvalPattern<
Vt, (vselect Vt:$vA, (VNor Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)),
24>;
// Pattern: A ? EQV(B,C) : AND(B,C) XXEVAL immediate value: 25
- def : XXEvalPatterns<
+ def : XXEvalPattern<
Vt, (vselect Vt:$vA, (VEqv Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)),
25>;
// Pattern: A ? NOT(C) : AND(B,C) XXEVAL immediate value: 26
- def : XXEvalPatterns<
+ def : XXEvalPattern<
Vt, (vselect Vt:$vA, (VNot Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), 26>;
// Pattern: A ? NOT(B) : AND(B,C) XXEVAL immediate value: 28
- def : XXEvalPatterns<
+ def : XXEvalPattern<
Vt, (vselect Vt:$vA, (VNot Vt:$vB), (VAnd Vt:$vB, Vt:$vC)), 28>;
}
@@ -2299,83 +2296,83 @@ let Predicates = [PrefixInstrs, HasP10Vector] in {
// Anonymous patterns for XXEVAL
// AND
// and(A, B, C)
- def : XXEvalPattern<(and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 1>;
+ def : XXEvalPattern<v4i32, (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 1>;
// and(A, xor(B, C))
- def : XXEvalPattern<(and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 6>;
+ def : XXEvalPattern<v4i32, (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 6>;
// and(A, or(B, C))
- def : XXEvalPattern<(and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>;
+ def : XXEvalPattern<v4i32, (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>;
// and(A, nor(B, C))
- def : XXEvalPattern<(and v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 8>;
+ def : XXEvalPattern<v4i32, (and v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 8>;
// and(A, eqv(B, C))
- def : XXEvalPattern<(and v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 9>;
+ def : XXEvalPattern<v4i32, (and v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 9>;
// and(A, nand(B, C))
- def : XXEvalPattern<(and v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 14>;
+ def : XXEvalPattern<v4i32, (and v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 14>;
// NAND
// nand(A, B, C)
- def : XXEvalPattern<(vnot (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))),
+ def : XXEvalPattern<v4i32, (vnot (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))),
!sub(255, 1)>;
// nand(A, xor(B, C))
- def : XXEvalPattern<(vnot (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))),
+ def : XXEvalPattern<v4i32, (vnot (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))),
!sub(255, 6)>;
// nand(A, or(B, C))
- def : XXEvalPattern<(vnot (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))),
+ def : XXEvalPattern<v4i32, (vnot (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))),
!sub(255, 7)>;
// nand(A, nor(B, C))
- def : XXEvalPattern<(or (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)),
+ def : XXEvalPattern<v4i32, (or (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)),
!sub(255, 8)>;
// nand(A, eqv(B, C))
- def : XXEvalPattern<(or (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)),
+ def : XXEvalPattern<v4i32, (or (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)),
!sub(255, 9)>;
// nand(A, nand(B, C))
- def : XXEvalPattern<(or (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)),
+ def : XXEvalPattern<v4i32, (or (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)),
!sub(255, 14)>;
// EQV
// (eqv A, B, C)
- def : XXEvalPattern<(or (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)),
+ def : XXEvalPattern<v4i32, (or (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)),
(vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)))),
150>;
// (eqv A, (and B, C))
- def : XXEvalPattern<(vnot (xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 225>;
+ def : XXEvalPattern<v4i32, (vnot (xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 225>;
// (eqv A, (or B, C))
- def : XXEvalPattern<(vnot (xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 135>;
+ def : XXEvalPattern<v4i32, (vnot (xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 135>;
// NOR
// (nor A, B, C)
- def : XXEvalPattern<(vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 128>;
+ def : XXEvalPattern<v4i32, (vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 128>;
// (nor A, (and B, C))
- def : XXEvalPattern<(vnot (or v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 224>;
+ def : XXEvalPattern<v4i32, (vnot (or v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 224>;
// (nor A, (eqv B, C))
- def : XXEvalPattern<(and (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), 96>;
+ def : XXEvalPattern<v4i32, (and (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), 96>;
// (nor A, (nand B, C))
- def : XXEvalPattern<(and (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), 16>;
+ def : XXEvalPattern<v4i32, (and (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), 16>;
// (nor A, (nor B, C))
- def : XXEvalPattern<(and (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), 112>;
+ def : XXEvalPattern<v4i32, (and (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), 112>;
// (nor A, (xor B, C))
- def : XXEvalPattern<(vnot (or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), 144>;
+ def : XXEvalPattern<v4i32, (vnot (or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), 144>;
// OR
// (or A, B, C)
- def : XXEvalPattern<(or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 127>;
+ def : XXEvalPattern<v4i32, (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 127>;
// (or A, (and B, C))
- def : XXEvalPattern<(or v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 31>;
+ def : XXEvalPattern<v4i32, (or v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 31>;
// (or A, (eqv B, C))
- def : XXEvalPattern<(or v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 159>;
+ def : XXEvalPattern<v4i32, (or v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 159>;
// (or A, (nand B, C))
- def : XXEvalPattern<(or v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 239>;
+ def : XXEvalPattern<v4i32, (or v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 239>;
// (or A, (nor B, C))
- def : XXEvalPattern<(or v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 143>;
+ def : XXEvalPattern<v4i32, (or v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 143>;
// (or A, (xor B, C))
- def : XXEvalPattern<(or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 111>;
+ def : XXEvalPattern<v4i32, (or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 111>;
// XOR
// (xor A, B, C)
- def : XXEvalPattern<(xor v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 105>;
+ def : XXEvalPattern<v4i32, (xor v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 105>;
// (xor A, (and B, C))
- def : XXEvalPattern<(xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 30>;
+ def : XXEvalPattern<v4i32, (xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 30>;
// (xor A, (or B, C))
- def : XXEvalPattern<(xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>;
+ def : XXEvalPattern<v4i32, (xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>;
// XXEval Patterns for ternary Operations.
foreach Ty = [v4i32, v2i64, v8i16, v16i8] in {
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
index d2b75a6..34026ed 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
@@ -45,8 +45,8 @@ public:
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
CCState &State) override {
- if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, Info.IsFixed,
- IsRet, Info.Ty))
+ if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, IsRet,
+ Info.Ty))
return true;
StackSize = State.getStackSize();
@@ -196,8 +196,8 @@ public:
if (LocVT.isScalableVector())
MF.getInfo<RISCVMachineFunctionInfo>()->setIsVectorCall();
- if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State,
- /*IsFixed=*/true, IsRet, Info.Ty))
+ if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, IsRet,
+ Info.Ty))
return true;
StackSize = State.getStackSize();
@@ -454,7 +454,7 @@ bool RISCVCallLowering::canLowerReturn(MachineFunction &MF,
for (unsigned I = 0, E = Outs.size(); I < E; ++I) {
MVT VT = MVT::getVT(Outs[I].Ty);
if (CC_RISCV(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], CCInfo,
- /*IsFixed=*/true, /*isRet=*/true, nullptr))
+ /*isRet=*/true, nullptr))
return false;
}
return true;
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
index cb6117e..70127e3 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
@@ -324,7 +324,7 @@ static MCRegister allocateRVVReg(MVT ValVT, unsigned ValNo, CCState &State,
// Implements the RISC-V calling convention. Returns true upon failure.
bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State, bool IsFixed, bool IsRet, Type *OrigTy) {
+ CCState &State, bool IsRet, Type *OrigTy) {
const MachineFunction &MF = State.getMachineFunction();
const DataLayout &DL = MF.getDataLayout();
const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>();
@@ -379,12 +379,12 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
break;
case RISCVABI::ABI_ILP32F:
case RISCVABI::ABI_LP64F:
- UseGPRForF16_F32 = !IsFixed;
+ UseGPRForF16_F32 = ArgFlags.isVarArg();
break;
case RISCVABI::ABI_ILP32D:
case RISCVABI::ABI_LP64D:
- UseGPRForF16_F32 = !IsFixed;
- UseGPRForF64 = !IsFixed;
+ UseGPRForF16_F32 = ArgFlags.isVarArg();
+ UseGPRForF64 = ArgFlags.isVarArg();
break;
}
@@ -465,7 +465,7 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
// currently if we are using ILP32E calling convention. This behavior may be
// changed when RV32E/ILP32E is ratified.
unsigned TwoXLenInBytes = (2 * XLen) / 8;
- if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes &&
+ if (ArgFlags.isVarArg() && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes &&
DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes &&
ABI != RISCVABI::ABI_ILP32E) {
unsigned RegIdx = State.getFirstUnallocated(ArgGPRs);
@@ -620,8 +620,8 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
// benchmark. But theoretically, it may have benefit for some cases.
bool llvm::CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State,
- bool IsFixed, bool IsRet, Type *OrigTy) {
+ ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsRet,
+ Type *OrigTy) {
const MachineFunction &MF = State.getMachineFunction();
const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>();
const RISCVTargetLowering &TLI = *Subtarget.getTargetLowering();
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.h b/llvm/lib/Target/RISCV/RISCVCallingConv.h
index bf823b7..2030ce1 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.h
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.h
@@ -21,15 +21,15 @@ namespace llvm {
typedef bool RISCVCCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State,
- bool IsFixed, bool IsRet, Type *OrigTy);
+ bool IsRet, Type *OrigTy);
bool CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State, bool IsFixed, bool IsRet, Type *OrigTy);
+ CCState &State, bool IsRet, Type *OrigTy);
bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State, bool IsFixed, bool IsRet, Type *OrigTy);
+ CCState &State, bool IsRet, Type *OrigTy);
bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
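The IsFixed flag is dropped from RISCVCCAssignFn because the same information now rides on the per-argument flags. A minimal sketch, not taken from the patch, of calling the updated hook; it assumes only the ISD::ArgFlagsTy::isVarArg() accessor that the rest of this diff already relies on:

// Illustrative only; presumes the in-tree RISCVCallingConv.h declarations above.
static bool assignOneRISCVArg(unsigned ValNo, MVT VT, ISD::ArgFlagsTy Flags,
                              CCState &State, bool IsRet, Type *OrigTy) {
  // Named arguments report Flags.isVarArg() == false; variadic ones true.
  // CC_RISCV still returns true on failure.
  return llvm::CC_RISCV(ValNo, VT, VT, CCValAssign::Full, Flags, State, IsRet,
                        OrigTy);
}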
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 171940e..a7329d2 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1700,6 +1700,18 @@ def TuneNLogNVRGather
def TunePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
"UsePostRAScheduler", "true", "Schedule again after register allocation">;
+def TuneDisableMISchedLoadClustering : SubtargetFeature<"disable-misched-load-clustering",
+ "EnableMISchedLoadClustering", "false", "Disable load clustering in the machine scheduler">;
+
+def TuneDisableMISchedStoreClustering : SubtargetFeature<"disable-misched-store-clustering",
+ "EnableMISchedStoreClustering", "false", "Disable store clustering in the machine scheduler">;
+
+def TuneDisablePostMISchedLoadClustering : SubtargetFeature<"disable-postmisched-load-clustering",
+ "EnablePostMISchedLoadClustering", "false", "Disable PostRA load clustering in the machine scheduler">;
+
+def TuneDisablePostMISchedStoreClustering : SubtargetFeature<"disable-postmisched-store-clustering",
+ "EnablePostMISchedStoreClustering", "false", "Disable PostRA store clustering in the machine scheduler">;
+
def TuneDisableLatencySchedHeuristic
: SubtargetFeature<"disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
"Disable latency scheduling heuristic">;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0077ecf..e4aa8b8 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -22282,8 +22282,8 @@ void RISCVTargetLowering::analyzeInputArgs(
else if (In.isOrigArg())
ArgTy = FType->getParamType(In.getOrigArgIndex());
- if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo,
- /*IsFixed=*/true, IsRet, ArgTy)) {
+ if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet,
+ ArgTy)) {
LLVM_DEBUG(dbgs() << "InputArg #" << Idx << " has unhandled type "
<< ArgVT << '\n');
llvm_unreachable(nullptr);
@@ -22300,8 +22300,8 @@ void RISCVTargetLowering::analyzeOutputArgs(
ISD::ArgFlagsTy ArgFlags = Out.Flags;
Type *OrigTy = CLI ? CLI->getArgs()[Out.OrigArgIndex].Ty : nullptr;
- if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, Out.IsFixed,
- IsRet, OrigTy)) {
+ if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet,
+ OrigTy)) {
LLVM_DEBUG(dbgs() << "OutputArg #" << Idx << " has unhandled type "
<< ArgVT << "\n");
llvm_unreachable(nullptr);
@@ -23083,7 +23083,7 @@ bool RISCVTargetLowering::CanLowerReturn(
MVT VT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
if (CC_RISCV(i, VT, VT, CCValAssign::Full, ArgFlags, CCInfo,
- /*IsFixed=*/true, /*IsRet=*/true, nullptr))
+ /*IsRet=*/true, nullptr))
return false;
}
return true;
@@ -24691,7 +24691,7 @@ SDValue RISCVTargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
SP = DAG.getNode(ISD::SUB, dl, XLenVT, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
- DAG.getSignedConstant(-(uint64_t)Align->value(), dl, VT));
+ DAG.getSignedConstant(-Align->value(), dl, VT));
// Set the real SP to the new value with a probing loop.
Chain = DAG.getNode(RISCVISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index d2a6514..413ad8b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -629,9 +629,6 @@ def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 8)),
def : Pat<(and (or (shl GPR:$rs2, (XLenVT 8)),
(zexti8 (XLenVT GPR:$rs1))), 0xFFFF),
(PACKH GPR:$rs1, GPR:$rs2)>;
-def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)),
- (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
- (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
def : Pat<(binop_allhusers<or> (shl GPR:$rs2, (XLenVT 8)),
(zexti8 (XLenVT GPR:$rs1))),
@@ -641,13 +638,19 @@ def : Pat<(binop_allhusers<or> (shl GPR:$rs2, (XLenVT 8)),
let Predicates = [HasStdExtZbkb, IsRV32] in {
def : Pat<(i32 (or (zexti16 (i32 GPR:$rs1)), (shl GPR:$rs2, (i32 16)))),
(PACK GPR:$rs1, GPR:$rs2)>;
-def : Pat<(or (or
- (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24)),
+
+def : Pat<(or (shl GPR:$rs2, (XLenVT 24)),
+ (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
+ (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+
+// Match a pattern of 2 bytes being inserted into bits [31:16], with
+// bits [15:0] coming from a zero extended value. We can use pack with packh for
+// bits [31:16]. If bits [15:0] can also be a packh, it can be matched
+// separately.
+def : Pat<(or (or (shl GPR:$op1rs2, (XLenVT 24)),
(shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
- (or
- (shl (zexti8 (XLenVT GPR:$op0rs2)), (XLenVT 8)),
- (zexti8 (XLenVT GPR:$op0rs1)))),
- (PACK (XLenVT (PACKH GPR:$op0rs1, GPR:$op0rs2)),
+ (zexti16 (XLenVT GPR:$rs1))),
+ (PACK (XLenVT GPR:$rs1),
(XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
}
@@ -655,12 +658,40 @@ let Predicates = [HasStdExtZbkb, IsRV64] in {
def : Pat<(i64 (or (zexti32 (i64 GPR:$rs1)), (shl GPR:$rs2, (i64 32)))),
(PACK GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)),
+ (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
+ (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (XLenVT 24)),
+ (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
+ (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+
def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (i64 16)),
(zexti16 (i64 GPR:$rs1))),
(PACKW GPR:$rs1, GPR:$rs2)>;
def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32),
(zexti16 (i64 GPR:$rs1)))),
(PACKW GPR:$rs1, GPR:$rs2)>;
+
+// Match a pattern of 2 bytes being inserted into bits [31:16], with
+// bits [15:0] coming from a zero extended value, and bits [63:32] being
+// ignored. We can use packw with packh for bits [31:16]. If bits [15:0] can
+// also be a packh, it can be matched separately.
+def : Pat<(binop_allwusers<or>
+ (or (shl GPR:$op1rs2, (XLenVT 24)),
+ (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
+ (zexti16 (XLenVT GPR:$rs1))),
+ (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+// We need to manually reassociate the patterns because of the binop_allwusers.
+def : Pat<(binop_allwusers<or>
+ (or (zexti16 (XLenVT GPR:$rs1)),
+ (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
+ (shl GPR:$op1rs2, (XLenVT 24))),
+ (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+def : Pat<(binop_allwusers<or>
+ (or (zexti16 (XLenVT GPR:$rs1)),
+ (shl GPR:$op1rs1, (XLenVT 24))),
+ (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))),
+ (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
} // Predicates = [HasStdExtZbkb, IsRV64]
let Predicates = [HasStdExtZbb, IsRV32] in
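The added Zbkb patterns target ordinary byte-packing code. A hypothetical C++ example, not from the patch, of the shape they match; the two 16-bit halves can be formed with PACKH and then combined with PACK (RV32) or PACKW (RV64):

#include <cstdint>

// Builds b3:b2:b1:b0 as a 32-bit value. The or-of-shifted zero-extended bytes
// is the structure the PACKH/PACK(W) patterns above are written to recognize.
uint32_t packBytes(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3) {
  return static_cast<uint32_t>(b0) | (static_cast<uint32_t>(b1) << 8) |
         (static_cast<uint32_t>(b2) << 16) | (static_cast<uint32_t>(b3) << 24);
}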
diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.td b/llvm/lib/Target/RISCV/RISCVMacroFusion.td
index 875a93d..39e099b 100644
--- a/llvm/lib/Target/RISCV/RISCVMacroFusion.td
+++ b/llvm/lib/Target/RISCV/RISCVMacroFusion.td
@@ -91,3 +91,59 @@ def TuneLDADDFusion
CheckIsImmOperand<2>,
CheckImmOperand<2, 0>
]>>;
+
+defvar Load = [LB, LH, LW, LD, LBU, LHU, LWU];
+
+// Fuse add(.uw) followed by a load (lb, lh, lw, ld, lbu, lhu, lwu):
+// add(.uw) rd, rs1, rs2
+// load rd, imm12(rd)
+def TuneADDLoadFusion
+ : SimpleFusion<"add-load-fusion", "HasADDLoadFusion", "Enable ADD(.UW) + load macrofusion",
+ CheckOpcode<[ADD, ADD_UW]>,
+ CheckOpcode<Load>>;
+
+// Fuse AUIPC followed by a load (lb, lh, lw, ld, lbu, lhu, lwu)
+// auipc rd, imm20
+// load rd, imm12(rd)
+def TuneAUIPCLoadFusion
+ : SimpleFusion<"auipc-load-fusion", "HasAUIPCLoadFusion",
+ "Enable AUIPC + load macrofusion",
+ CheckOpcode<[AUIPC]>,
+ CheckOpcode<Load>>;
+
+// Fuse LUI followed by a load (lb, lh, lw, ld, lbu, lhu, lwu)
+// lui rd, imm[31:12]
+// load rd, imm12(rd)
+def TuneLUILoadFusion
+ : SimpleFusion<"lui-load-fusion", "HasLUILoadFusion",
+ "Enable LUI + load macrofusion",
+ CheckOpcode<[LUI]>,
+ CheckOpcode<Load>>;
+
+// Bitfield extract fusion: similar to TuneShiftedZExtWFusion
+// but without the immediate restriction
+// slli rd, rs1, imm12
+// srli rd, rd, imm12
+def TuneBFExtFusion
+ : SimpleFusion<"bfext-fusion", "HasBFExtFusion",
+ "Enable SLLI+SRLI (bitfield extract) macrofusion",
+ CheckOpcode<[SLLI]>,
+ CheckOpcode<[SRLI]>>;
+
+// Fuse ADDI followed by a load (lb, lh, lw, ld, lbu, lhu, lwu)
+// addi rd, rs1, imm12
+// load rd, imm12(rd)
+def TuneADDILoadFusion
+ : SimpleFusion<"addi-load-fusion", "HasADDILoadFusion",
+ "Enable ADDI + load macrofusion",
+ CheckOpcode<[ADDI]>,
+ CheckOpcode<Load>>;
+
+// Fuse shXadd(.uw) followed by a load (lb, lh, lw, ld, lbu, lhu, lwu)
+// shXadd(.uw) rd, rs1, rs2
+// load rd, imm12(rd)
+def TuneSHXADDLoadFusion
+ : SimpleFusion<"shxadd-load-fusion", "HasSHXADDLoadFusion",
+ "Enable SH(1|2|3)ADD(.UW) + load macrofusion",
+ CheckOpcode<[SH1ADD, SH2ADD, SH3ADD, SH1ADD_UW, SH2ADD_UW, SH3ADD_UW]>,
+ CheckOpcode<Load>>;
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 838edf6..31d2b3a 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -590,12 +590,17 @@ def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1",
FeatureStdExtZicboz,
FeatureVendorXVentanaCondOps],
[TuneVentanaVeyron,
+ TuneDisableMISchedLoadClustering,
+ TuneDisablePostMISchedLoadClustering,
+ TuneDisablePostMISchedStoreClustering,
TuneLUIADDIFusion,
TuneAUIPCADDIFusion,
TuneZExtHFusion,
TuneZExtWFusion,
TuneShiftedZExtWFusion,
- TuneLDADDFusion]> {
+ TuneADDLoadFusion,
+ TuneAUIPCLoadFusion,
+ TuneLUILoadFusion]> {
let MVendorID = 0x61f;
let MArchID = 0x8000000000010000;
let MImpID = 0x111;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
index bf23812..5541506 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
@@ -13,78 +13,113 @@
//
//===----------------------------------------------------------------------===//
-class SMX60IsWorstCaseMX<string mx, list<string> MxList> {
- string LLMUL = LargestLMUL<MxList>.r;
- bit c = !eq(mx, LLMUL);
-}
+//===----------------------------------------------------------------------===//
+// Helpers
+
+// Maps LMUL string to corresponding value from the Values array
+// LMUL values map to array indices as follows:
+// MF8 -> Values[0], MF4 -> Values[1], MF2 -> Values[2], M1 -> Values[3],
+// M2 -> Values[4], M4 -> Values[5], M8 -> Values[6]
+// Shorter lists are allowed, e.g., widening instructions don't work on M8
+class GetLMULValue<list<int> Values, string LMUL> {
+ defvar Index = !cond(
+ !eq(LMUL, "MF8"): 0,
+ !eq(LMUL, "MF4"): 1,
+ !eq(LMUL, "MF2"): 2,
+ !eq(LMUL, "M1"): 3,
+ !eq(LMUL, "M2"): 4,
+ !eq(LMUL, "M4"): 5,
+ !eq(LMUL, "M8"): 6,
+ );
-class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> {
- string LLMUL = LargestLMUL<MxList>.r;
- int SSEW = SmallestSEW<mx, isF>.r;
- bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
+ assert !lt(Index, !size(Values)),
+ "Missing LMUL value for '" # LMUL # "'. " #
+ "Expected at least " # !add(Index, 1) # " elements, but got " #
+ !size(Values) # ".";
+
+ int c = Values[Index];
}
-defvar SMX60VLEN = 256;
-defvar SMX60DLEN = !div(SMX60VLEN, 2);
+// Returns BaseValue for LMUL values before startLMUL, Value for startLMUL,
+// then doubles Value for each subsequent LMUL
+// Example: ConstValueUntilLMULThenDoubleBase<"M1", 2, 4, "M8"> returns:
+// MF8->2, MF4->2, MF2->2, M1->4, M2->8, M4->16, M8->32
+// This is useful for modeling scheduling parameters that scale with LMUL.
+class ConstValueUntilLMULThenDoubleBase<string startLMUL, int BaseValue, int Value, string currentLMUL> {
+ assert !le(BaseValue, Value), "BaseValue must be less than or equal to Value";
+ defvar startPos = GetLMULValue<[0, 1, 2, 3, 4, 5, 6], startLMUL>.c;
+ defvar currentPos = GetLMULValue<[0, 1, 2, 3, 4, 5, 6], currentLMUL>.c;
-class Get1248Latency<string mx> {
+ // Calculate the difference in positions
+ defvar posDiff = !sub(currentPos, startPos);
+
+ // Calculate Value * (2^posDiff)
int c = !cond(
- !eq(mx, "M2") : 2,
- !eq(mx, "M4") : 4,
- !eq(mx, "M8") : 8,
- true: 1
+ !eq(posDiff, 0) : Value,
+ !eq(posDiff, 1) : !mul(Value, 2),
+ !eq(posDiff, 2) : !mul(Value, 4),
+ !eq(posDiff, 3) : !mul(Value, 8),
+ !eq(posDiff, 4) : !mul(Value, 16),
+ !eq(posDiff, 5) : !mul(Value, 32),
+ !eq(posDiff, 6) : !mul(Value, 64),
+ true : BaseValue
);
}
-// Used for: logical opsz, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides
-class Get4816Latency<string mx> {
- int c = !cond(
- !eq(mx, "M4") : 8,
- !eq(mx, "M8") : 16,
- true: 4
- );
+// Same as ConstValueUntilLMULThenDoubleBase, but with BaseValue == Value
+class ConstValueUntilLMULThenDouble<string startLMUL, int Value, string currentLMUL> {
+ int c = ConstValueUntilLMULThenDoubleBase<startLMUL, Value, Value, currentLMUL>.c;
+}
+
+// Returns MF8->1, MF4->1, MF2->2, M1->4, M2->8, M4->16, M8->32
+class ConstOneUntilMF4ThenDouble<string mx> {
+ int c = ConstValueUntilLMULThenDouble<"MF4", 1, mx>.c;
+}
+
+// Returns MF8->1, MF4->1, MF2->1, M1->2, M2->4, M4->8, M8->16
+class ConstOneUntilMF2ThenDouble<string mx> {
+ int c = ConstValueUntilLMULThenDouble<"MF2", 1, mx>.c;
+}
+
+// Returns MF8->1, MF4->1, MF2->1, M1->1, M2->2, M4->4, M8->8
+class ConstOneUntilM1ThenDouble<string mx> {
+ int c = ConstValueUntilLMULThenDouble<"M1", 1, mx>.c;
}
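The helpers above reduce to a simple closed form: the value stays at BaseValue below startLMUL and is Value doubled once per LMUL step at or beyond it. A small illustrative C++ equivalent, not part of the scheduling model:

// LMUL index: MF8=0, MF4=1, MF2=2, M1=3, M2=4, M4=5, M8=6.
int lmulScaledValue(int StartIdx, int BaseValue, int Value, int CurIdx) {
  if (CurIdx < StartIdx)
    return BaseValue;                   // constant region below startLMUL
  return Value << (CurIdx - StartIdx);  // Value * 2^(steps past startLMUL)
}
// lmulScaledValue(/*M1=*/3, /*Base=*/2, /*Value=*/4, /*M8=*/6) == 32, matching
// the ConstValueUntilLMULThenDoubleBase<"M1", 2, 4, "M8"> example above.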
+//===----------------------------------------------------------------------===//
+// Latency helper classes
+
// Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max
-class Get458Latency<string mx> {
- int c = !cond(
- !eq(mx, "M4") : 5,
- !eq(mx, "M8") : 8,
- true: 4
- );
+class Get4458Latency<string mx> {
+ int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/4, /*M4=*/5, /*M8=*/8], mx>.c;
}
-// Widening scaling pattern (4,4,4,4,5,8,8): plateaus at higher LMULs
-// Used for: widening operations
+// Used for: widening operations (no M8)
class Get4588Latency<string mx> {
- int c = !cond(
- !eq(mx, "M2") : 5,
- !eq(mx, "M4") : 8,
- !eq(mx, "M8") : 8, // M8 not supported for most widening, fallback
- true: 4
- );
+ int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/5, /*M4=*/8], mx>.c;
}
// Used for: mask-producing comparisons, carry ops with mask, FP comparisons
class Get461018Latency<string mx> {
- int c = !cond(
- !eq(mx, "M2") : 6,
- !eq(mx, "M4") : 10,
- !eq(mx, "M8") : 18,
- true: 4
- );
+ int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/6, /*M4=*/10, /*M8=*/18], mx>.c;
}
-// Used for: e64 multiply pattern, complex ops
-class Get781632Latency<string mx> {
- int c = !cond(
- !eq(mx, "M2") : 8,
- !eq(mx, "M4") : 16,
- !eq(mx, "M8") : 32,
- true: 7
- );
+//===----------------------------------------------------------------------===//
+
+class SMX60IsWorstCaseMX<string mx, list<string> MxList> {
+ string LLMUL = LargestLMUL<MxList>.r;
+ bit c = !eq(mx, LLMUL);
}
+class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> {
+ string LLMUL = LargestLMUL<MxList>.r;
+ int SSEW = SmallestSEW<mx, isF>.r;
+ bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
+}
+
+defvar SMX60VLEN = 256;
+defvar SMX60DLEN = !div(SMX60VLEN, 2);
+
def SpacemitX60Model : SchedMachineModel {
let IssueWidth = 2; // dual-issue
let MicroOpBufferSize = 0; // in-order
@@ -383,12 +418,13 @@ foreach LMul = [1, 2, 4, 8] in {
foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
- let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in {
+ let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [4] in {
defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
}
- let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defvar VIALULat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
+ let Latency = VIALULat, ReleaseAtCycles = [4] in {
// Pattern of vadd, vsub, vrsub: 4/4/5/8
// Pattern of vand, vor, vxor: 4/4/8/16
// They are grouped together, so we used the worst case 4/4/8/16
@@ -425,7 +461,7 @@ foreach mx = SchedMxList in {
// Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5,5,5,8,
// e64 = 7,8,16,32. We use the worst-case until we can split the SEW.
// TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites
- let Latency = Get781632Latency<mx>.c, ReleaseAtCycles = [7] in {
+ let Latency = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c, ReleaseAtCycles = [7] in {
defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
@@ -461,15 +497,8 @@ foreach mx = SchedMxList in {
foreach sew = SchedSEWSet<mx>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
- // Slightly reduced for fractional LMULs
- defvar Multiplier = !cond(
- !eq(mx, "MF8") : 12,
- !eq(mx, "MF4") : 12,
- !eq(mx, "MF2") : 12,
- true: 24
- );
-
- let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in {
+ defvar VIDivLat = ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c;
+ let Latency = VIDivLat, ReleaseAtCycles = [12] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
}
@@ -480,14 +509,8 @@ foreach mx = SchedMxList in {
foreach mx = SchedMxListW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
- // Slightly increased for integer LMULs
- defvar Multiplier = !cond(
- !eq(mx, "M2") : 2,
- !eq(mx, "M4") : 2,
- true: 1
- );
-
- let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in {
+ defvar VNarrowingLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
+ let Latency = VNarrowingLat, ReleaseAtCycles = [4] in {
defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 3f2a83f..66ce134 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -94,16 +94,6 @@ static cl::opt<bool>
cl::desc("Enable the loop data prefetch pass"),
cl::init(true));
-static cl::opt<bool> EnableMISchedLoadStoreClustering(
- "riscv-misched-load-store-clustering", cl::Hidden,
- cl::desc("Enable load and store clustering in the machine scheduler"),
- cl::init(true));
-
-static cl::opt<bool> EnablePostMISchedLoadStoreClustering(
- "riscv-postmisched-load-store-clustering", cl::Hidden,
- cl::desc("Enable PostRA load and store clustering in the machine scheduler"),
- cl::init(true));
-
static cl::opt<bool> DisableVectorMaskMutation(
"riscv-disable-vector-mask-mutation",
cl::desc("Disable the vector mask scheduling mutation"), cl::init(false),
@@ -294,15 +284,17 @@ bool RISCVTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
ScheduleDAGInstrs *
RISCVTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
+ const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
ScheduleDAGMILive *DAG = createSchedLive(C);
- if (EnableMISchedLoadStoreClustering) {
+
+ if (ST.enableMISchedLoadClustering())
DAG->addMutation(createLoadClusterDAGMutation(
DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true));
+
+ if (ST.enableMISchedStoreClustering())
DAG->addMutation(createStoreClusterDAGMutation(
DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true));
- }
- const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
if (!DisableVectorMaskMutation && ST.hasVInstructions())
DAG->addMutation(createRISCVVectorMaskDAGMutation(DAG->TRI));
@@ -311,13 +303,16 @@ RISCVTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
ScheduleDAGInstrs *
RISCVTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
+ const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
ScheduleDAGMI *DAG = createSchedPostRA(C);
- if (EnablePostMISchedLoadStoreClustering) {
+
+ if (ST.enablePostMISchedLoadClustering())
DAG->addMutation(createLoadClusterDAGMutation(
DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true));
+
+ if (ST.enablePostMISchedStoreClustering())
DAG->addMutation(createStoreClusterDAGMutation(
DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true));
- }
return DAG;
}
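With the two cl::opt switches gone, clustering is now a per-subtarget decision: a processor opts out through the TuneDisable*Clustering features added to RISCVFeatures.td (as veyron-v1 does in RISCVProcessors.td above), and the same effect can presumably still be requested ad hoc with the generic subtarget-feature syntax, e.g. -mattr=+disable-misched-load-clustering, instead of the removed -riscv-misched-load-store-clustering flag.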
diff --git a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h
index 78a066b..ed0a1e1 100644
--- a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h
+++ b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h
@@ -73,7 +73,11 @@ public:
Entry(std::move(CR.Entry)), Exits(std::move(CR.Exits)),
Blocks(std::move(CR.Blocks)) {}
+ ~ConvergenceRegion() { releaseMemory(); }
+
+ ConvergenceRegion &operator=(ConvergenceRegion &&CR) = delete;
ConvergenceRegion(const ConvergenceRegion &other) = delete;
+ ConvergenceRegion &operator=(const ConvergenceRegion &other) = delete;
// Returns true if the given basic block belongs to this region, or to one of
// its subregion.
@@ -101,6 +105,9 @@ public:
~ConvergenceRegionInfo() { releaseMemory(); }
+ ConvergenceRegionInfo(const ConvergenceRegionInfo &LHS) = delete;
+ ConvergenceRegionInfo &operator=(const ConvergenceRegionInfo &LHS) = delete;
+
ConvergenceRegionInfo(ConvergenceRegionInfo &&LHS)
: TopLevelRegion(LHS.TopLevelRegion) {
if (TopLevelRegion != LHS.TopLevelRegion) {
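The newly deleted assignment operators (and, for ConvergenceRegionInfo, the copy operations) pair with the destructors that call releaseMemory(): an implicitly generated copy or assignment would presumably leave two objects releasing the same region data, so ownership is only transferable through the existing move constructors.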
diff --git a/llvm/lib/Target/SPIRV/CMakeLists.txt b/llvm/lib/Target/SPIRV/CMakeLists.txt
index ba09451..6660de9 100644
--- a/llvm/lib/Target/SPIRV/CMakeLists.txt
+++ b/llvm/lib/Target/SPIRV/CMakeLists.txt
@@ -26,6 +26,7 @@ add_llvm_target(SPIRVCodeGen
SPIRVGlobalRegistry.cpp
SPIRVInstrInfo.cpp
SPIRVInstructionSelector.cpp
+ SPIRVLegalizeImplicitBinding.cpp
SPIRVStripConvergentIntrinsics.cpp
SPIRVLegalizePointerCast.cpp
SPIRVMergeRegionExitTargets.cpp
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
index 64d301e..4ec31bf 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
@@ -96,7 +96,7 @@ void SPIRVInstPrinter::printOpConstantVarOps(const MCInst *MI,
void SPIRVInstPrinter::recordOpExtInstImport(const MCInst *MI) {
MCRegister Reg = MI->getOperand(0).getReg();
auto Name = getSPIRVStringOperand(*MI, 1);
- auto Set = getExtInstSetFromString(Name);
+ auto Set = getExtInstSetFromString(std::move(Name));
ExtInstSetIDs.insert({Reg, Set});
}
@@ -210,6 +210,7 @@ void SPIRVInstPrinter::printInst(const MCInst *MI, uint64_t Address,
case SPIRV::OpConstantF:
// The last fixed operand along with any variadic operands that follow
// are part of the variable value.
+ assert(NumFixedOps > 0 && "Expected at least one fixed operand");
printOpConstantVarOps(MI, NumFixedOps - 1, OS);
break;
case SPIRV::OpCooperativeMatrixMulAddKHR: {
diff --git a/llvm/lib/Target/SPIRV/SPIRV.h b/llvm/lib/Target/SPIRV/SPIRV.h
index 1688fa3..1934e98 100644
--- a/llvm/lib/Target/SPIRV/SPIRV.h
+++ b/llvm/lib/Target/SPIRV/SPIRV.h
@@ -23,6 +23,7 @@ ModulePass *createSPIRVPrepareFunctionsPass(const SPIRVTargetMachine &TM);
FunctionPass *createSPIRVStructurizerPass();
FunctionPass *createSPIRVMergeRegionExitTargetsPass();
FunctionPass *createSPIRVStripConvergenceIntrinsicsPass();
+ModulePass *createSPIRVLegalizeImplicitBindingPass();
FunctionPass *createSPIRVLegalizePointerCastPass(SPIRVTargetMachine *TM);
FunctionPass *createSPIRVRegularizerPass();
FunctionPass *createSPIRVPreLegalizerCombiner();
@@ -49,6 +50,7 @@ void initializeSPIRVRegularizerPass(PassRegistry &);
void initializeSPIRVMergeRegionExitTargetsPass(PassRegistry &);
void initializeSPIRVPrepareFunctionsPass(PassRegistry &);
void initializeSPIRVStripConvergentIntrinsicsPass(PassRegistry &);
+void initializeSPIRVLegalizeImplicitBindingPass(PassRegistry &);
} // namespace llvm
#endif // LLVM_LIB_TARGET_SPIRV_SPIRV_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVAPI.cpp b/llvm/lib/Target/SPIRV/SPIRVAPI.cpp
index cfe7ef4..d6581b2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAPI.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAPI.cpp
@@ -156,7 +156,7 @@ SPIRVTranslateModule(Module *M, std::string &SpirvObj, std::string &ErrMsg,
}
}
return SPIRVTranslate(M, SpirvObj, ErrMsg, AllowExtNames, OLevel,
- TargetTriple);
+ std::move(TargetTriple));
}
} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 1ebfde2..c2a6e51 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -50,7 +50,8 @@ class SPIRVAsmPrinter : public AsmPrinter {
public:
explicit SPIRVAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer), ID), ST(nullptr), TII(nullptr) {}
+ : AsmPrinter(TM, std::move(Streamer), ID), ModuleSectionsEmitted(false),
+ ST(nullptr), TII(nullptr), MAI(nullptr) {}
static char ID;
bool ModuleSectionsEmitted;
const SPIRVSubtarget *ST;
@@ -591,7 +592,9 @@ void SPIRVAsmPrinter::outputAnnotations(const Module &M) {
cast<GlobalVariable>(CS->getOperand(1)->stripPointerCasts());
StringRef AnnotationString;
- getConstantStringInfo(GV, AnnotationString);
+ [[maybe_unused]] bool Success =
+ getConstantStringInfo(GV, AnnotationString);
+ assert(Success && "Failed to get annotation string");
MCInst Inst;
Inst.setOpcode(SPIRV::OpDecorate);
Inst.addOperand(MCOperand::createReg(Reg));
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 25cdf72..e6e86b7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -51,7 +51,7 @@ struct IncomingCall {
IncomingCall(const std::string BuiltinName, const DemangledBuiltin *Builtin,
const Register ReturnRegister, const SPIRVType *ReturnType,
const SmallVectorImpl<Register> &Arguments)
- : BuiltinName(BuiltinName), Builtin(Builtin),
+ : BuiltinName(std::move(BuiltinName)), Builtin(Builtin),
ReturnRegister(ReturnRegister), ReturnType(ReturnType),
Arguments(Arguments) {}
@@ -2619,6 +2619,7 @@ static bool generateConvertInst(const StringRef DemangledCall,
GR->getSPIRVTypeID(Call->ReturnType));
}
+ assert(Builtin && "Conversion builtin not found.");
if (Builtin->IsSaturated)
buildOpDecorate(Call->ReturnRegister, MIRBuilder,
SPIRV::Decoration::SaturatedConversion, {});
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 2c3e087..f5a49e2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -499,7 +499,7 @@ void SPIRVEmitIntrinsics::propagateElemTypeRec(
std::unordered_set<Value *> Visited;
DenseMap<Function *, CallInst *> Ptrcasts;
propagateElemTypeRec(Op, PtrElemTy, CastElemTy, VisitedSubst, Visited,
- Ptrcasts);
+ std::move(Ptrcasts));
}
void SPIRVEmitIntrinsics::propagateElemTypeRec(
@@ -897,17 +897,16 @@ Type *SPIRVEmitIntrinsics::deduceNestedTypeHelper(
bool Change = false;
for (unsigned i = 0; i < U->getNumOperands(); ++i) {
Value *Op = U->getOperand(i);
+ assert(Op && "Operands should not be null.");
Type *OpTy = Op->getType();
Type *Ty = OpTy;
- if (Op) {
- if (auto *PtrTy = dyn_cast<PointerType>(OpTy)) {
- if (Type *NestedTy =
- deduceElementTypeHelper(Op, Visited, UnknownElemTypeI8))
- Ty = getTypedPointerWrapper(NestedTy, PtrTy->getAddressSpace());
- } else {
- Ty = deduceNestedTypeHelper(dyn_cast<User>(Op), OpTy, Visited,
- UnknownElemTypeI8);
- }
+ if (auto *PtrTy = dyn_cast<PointerType>(OpTy)) {
+ if (Type *NestedTy =
+ deduceElementTypeHelper(Op, Visited, UnknownElemTypeI8))
+ Ty = getTypedPointerWrapper(NestedTy, PtrTy->getAddressSpace());
+ } else {
+ Ty = deduceNestedTypeHelper(dyn_cast<User>(Op), OpTy, Visited,
+ UnknownElemTypeI8);
}
Tys.push_back(Ty);
Change |= Ty != OpTy;
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp
index 7f0d636..275463e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp
@@ -116,6 +116,7 @@ bool SPIRVEmitNonSemanticDI::emitGlobalDI(MachineFunction &MF) {
}
}
const NamedMDNode *ModuleFlags = M->getNamedMetadata("llvm.module.flags");
+ assert(ModuleFlags && "Expected llvm.module.flags metadata to be present");
for (const auto *Op : ModuleFlags->operands()) {
const MDOperand &MaybeStrOp = Op->getOperand(1);
if (MaybeStrOp.equalsStr("Dwarf Version"))
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index f1436d5..cfe24c8 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -87,7 +87,7 @@ storageClassRequiresExplictLayout(SPIRV::StorageClass::StorageClass SC) {
}
SPIRVGlobalRegistry::SPIRVGlobalRegistry(unsigned PointerSize)
- : PointerSize(PointerSize), Bound(0) {}
+ : PointerSize(PointerSize), Bound(0), CurMF(nullptr) {}
SPIRVType *SPIRVGlobalRegistry::assignIntTypeToVReg(unsigned BitWidth,
Register VReg,
@@ -474,8 +474,8 @@ Register SPIRVGlobalRegistry::getOrCreateBaseRegister(
}
if (Type->getOpcode() == SPIRV::OpTypeFloat) {
SPIRVType *SpvBaseType = getOrCreateSPIRVFloatType(BitWidth, I, TII);
- return getOrCreateConstFP(dyn_cast<ConstantFP>(Val)->getValue(), I,
- SpvBaseType, TII, ZeroAsNull);
+ return getOrCreateConstFP(cast<ConstantFP>(Val)->getValue(), I, SpvBaseType,
+ TII, ZeroAsNull);
}
assert(Type->getOpcode() == SPIRV::OpTypeInt);
SPIRVType *SpvBaseType = getOrCreateSPIRVIntegerType(BitWidth, I, TII);
@@ -1069,7 +1069,8 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType(
MIRBuilder);
};
}
- return getOpTypeStruct(SType, MIRBuilder, AccQual, Decorator, EmitIR);
+ return getOpTypeStruct(SType, MIRBuilder, AccQual, std::move(Decorator),
+ EmitIR);
}
if (auto FType = dyn_cast<FunctionType>(Ty)) {
SPIRVType *RetTy = findSPIRVType(FType->getReturnType(), MIRBuilder,
@@ -1406,8 +1407,9 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateLayoutType(
// We need a new OpTypeStruct instruction because decorations will be
// different from a struct with an explicit layout created from a different
// entry point.
- SPIRVType *SPIRVStructType = getOpTypeStruct(
- ST, MIRBuilder, SPIRV::AccessQualifier::None, Decorator, EmitIr);
+ SPIRVType *SPIRVStructType =
+ getOpTypeStruct(ST, MIRBuilder, SPIRV::AccessQualifier::None,
+ std::move(Decorator), EmitIr);
add(Key, SPIRVStructType);
return SPIRVStructType;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index e9f5ffa..5259db1 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -362,6 +362,7 @@ SPIRVInstructionSelector::SPIRVInstructionSelector(const SPIRVTargetMachine &TM,
const RegisterBankInfo &RBI)
: InstructionSelector(), STI(ST), TII(*ST.getInstrInfo()),
TRI(*ST.getRegisterInfo()), RBI(RBI), GR(*ST.getSPIRVGlobalRegistry()),
+ MRI(nullptr),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "SPIRVGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
@@ -3574,7 +3575,7 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
// Join all the resulting registers back into the return type in order
// (ie i32x2, i32x2, i32x1 -> i32x5)
- return selectOpWithSrcs(ResVReg, ResType, I, PartialRegs,
+ return selectOpWithSrcs(ResVReg, ResType, I, std::move(PartialRegs),
SPIRV::OpCompositeConstruct);
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp
new file mode 100644
index 0000000..0398e52
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp
@@ -0,0 +1,159 @@
+//===- SPIRVLegalizeImplicitBinding.cpp - Legalize implicit bindings ----*- C++
+//-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass legalizes the @llvm.spv.resource.handlefromimplicitbinding
+// intrinsic by replacing it with a call to
+// @llvm.spv.resource.handlefrombinding.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRV.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsSPIRV.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include <algorithm>
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+class SPIRVLegalizeImplicitBinding : public ModulePass {
+public:
+ static char ID;
+ SPIRVLegalizeImplicitBinding() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override;
+
+private:
+ void collectBindingInfo(Module &M);
+ uint32_t getAndReserveFirstUnusedBinding(uint32_t DescSet);
+ void replaceImplicitBindingCalls(Module &M);
+
+ // A map from descriptor set to a bit vector of used binding numbers.
+ std::vector<BitVector> UsedBindings;
+ // A list of all implicit binding calls, to be sorted by order ID.
+ SmallVector<CallInst *, 16> ImplicitBindingCalls;
+};
+
+struct BindingInfoCollector : public InstVisitor<BindingInfoCollector> {
+ std::vector<BitVector> &UsedBindings;
+ SmallVector<CallInst *, 16> &ImplicitBindingCalls;
+
+ BindingInfoCollector(std::vector<BitVector> &UsedBindings,
+ SmallVector<CallInst *, 16> &ImplicitBindingCalls)
+ : UsedBindings(UsedBindings), ImplicitBindingCalls(ImplicitBindingCalls) {
+ }
+
+ void visitCallInst(CallInst &CI) {
+ if (CI.getIntrinsicID() == Intrinsic::spv_resource_handlefrombinding) {
+ const uint32_t DescSet =
+ cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
+ const uint32_t Binding =
+ cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
+
+ if (UsedBindings.size() <= DescSet) {
+ UsedBindings.resize(DescSet + 1);
+ UsedBindings[DescSet].resize(64);
+ }
+ if (UsedBindings[DescSet].size() <= Binding) {
+ UsedBindings[DescSet].resize(2 * Binding + 1);
+ }
+ UsedBindings[DescSet].set(Binding);
+ } else if (CI.getIntrinsicID() ==
+ Intrinsic::spv_resource_handlefromimplicitbinding) {
+ ImplicitBindingCalls.push_back(&CI);
+ }
+ }
+};
+
+void SPIRVLegalizeImplicitBinding::collectBindingInfo(Module &M) {
+ BindingInfoCollector InfoCollector(UsedBindings, ImplicitBindingCalls);
+ InfoCollector.visit(M);
+
+ // Sort the collected calls by their order ID.
+ std::sort(
+ ImplicitBindingCalls.begin(), ImplicitBindingCalls.end(),
+ [](const CallInst *A, const CallInst *B) {
+ const uint32_t OrderIdArgIdx = 0;
+ const uint32_t OrderA =
+ cast<ConstantInt>(A->getArgOperand(OrderIdArgIdx))->getZExtValue();
+ const uint32_t OrderB =
+ cast<ConstantInt>(B->getArgOperand(OrderIdArgIdx))->getZExtValue();
+ return OrderA < OrderB;
+ });
+}
+
+uint32_t SPIRVLegalizeImplicitBinding::getAndReserveFirstUnusedBinding(
+ uint32_t DescSet) {
+ if (UsedBindings.size() <= DescSet) {
+ UsedBindings.resize(DescSet + 1);
+ UsedBindings[DescSet].resize(64);
+ }
+
+ int NewBinding = UsedBindings[DescSet].find_first_unset();
+ if (NewBinding == -1) {
+ NewBinding = UsedBindings[DescSet].size();
+ UsedBindings[DescSet].resize(2 * NewBinding + 1);
+ }
+
+ UsedBindings[DescSet].set(NewBinding);
+ return NewBinding;
+}
+
+void SPIRVLegalizeImplicitBinding::replaceImplicitBindingCalls(Module &M) {
+ for (CallInst *OldCI : ImplicitBindingCalls) {
+ IRBuilder<> Builder(OldCI);
+ const uint32_t DescSet =
+ cast<ConstantInt>(OldCI->getArgOperand(1))->getZExtValue();
+ const uint32_t NewBinding = getAndReserveFirstUnusedBinding(DescSet);
+
+ SmallVector<Value *, 8> Args;
+ Args.push_back(Builder.getInt32(DescSet));
+ Args.push_back(Builder.getInt32(NewBinding));
+
+ // Copy the remaining arguments from the old call.
+ for (uint32_t i = 2; i < OldCI->arg_size(); ++i) {
+ Args.push_back(OldCI->getArgOperand(i));
+ }
+
+ Function *NewFunc = Intrinsic::getOrInsertDeclaration(
+ &M, Intrinsic::spv_resource_handlefrombinding, OldCI->getType());
+ CallInst *NewCI = Builder.CreateCall(NewFunc, Args);
+ NewCI->setCallingConv(OldCI->getCallingConv());
+
+ OldCI->replaceAllUsesWith(NewCI);
+ OldCI->eraseFromParent();
+ }
+}
+
+bool SPIRVLegalizeImplicitBinding::runOnModule(Module &M) {
+ collectBindingInfo(M);
+ if (ImplicitBindingCalls.empty()) {
+ return false;
+ }
+
+ replaceImplicitBindingCalls(M);
+ return true;
+}
+} // namespace
+
+char SPIRVLegalizeImplicitBinding::ID = 0;
+
+INITIALIZE_PASS(SPIRVLegalizeImplicitBinding, "legalize-spirv-implicit-binding",
+ "Legalize SPIR-V implicit bindings", false, false)
+
+ModulePass *llvm::createSPIRVLegalizeImplicitBindingPass() {
+ return new SPIRVLegalizeImplicitBinding();
+}
\ No newline at end of file
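In short, the pass records every (set, binding) pair already claimed by an explicit handlefrombinding call, then rewrites each handlefromimplicitbinding call, in order-ID order, to handlefrombinding using the first unset binding in that descriptor set; e.g. if set 0 already uses binding 0 explicitly, the first implicit binding in set 0 becomes binding 1 and is reserved for subsequent lookups. A hypothetical standalone driver, shown only for illustration since the pass is normally scheduled from SPIRVPassConfig::addISelPrepare() (see the SPIRVTargetMachine.cpp hunk further below):

#include "SPIRV.h"
#include "llvm/IR/LegacyPassManager.h"

void legalizeImplicitBindings(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createSPIRVLegalizeImplicitBindingPass());
  PM.run(M); // rewrites the implicit-binding intrinsics in place
}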
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index ab06fc0..8039cf0 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -93,7 +93,7 @@ getSymbolicOperandRequirements(SPIRV::OperandCategory::OperandCategory Category,
if (Reqs.isCapabilityAvailable(Cap)) {
ReqExts.append(getSymbolicOperandExtensions(
SPIRV::OperandCategory::CapabilityOperand, Cap));
- return {true, {Cap}, ReqExts, ReqMinVer, ReqMaxVer};
+ return {true, {Cap}, std::move(ReqExts), ReqMinVer, ReqMaxVer};
}
} else {
// By SPIR-V specification: "If an instruction, enumerant, or other
@@ -111,7 +111,7 @@ getSymbolicOperandRequirements(SPIRV::OperandCategory::OperandCategory Category,
if (i == Sz - 1 || !AvoidCaps.S.contains(Cap)) {
ReqExts.append(getSymbolicOperandExtensions(
SPIRV::OperandCategory::CapabilityOperand, Cap));
- return {true, {Cap}, ReqExts, ReqMinVer, ReqMaxVer};
+ return {true, {Cap}, std::move(ReqExts), ReqMinVer, ReqMaxVer};
}
}
}
@@ -558,7 +558,7 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI,
bool Append = true) {
MAI.setSkipEmission(&MI);
InstrSignature MISign = instrToSignature(MI, MAI, true);
- auto FoundMI = IS.insert(MISign);
+ auto FoundMI = IS.insert(std::move(MISign));
if (!FoundMI.second)
return; // insert failed, so we found a duplicate; don't add it to MAI.MS
// No duplicates, so add it.
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
index a0d47cb..41c792a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
@@ -54,8 +54,8 @@ struct Requirements {
std::optional<Capability::Capability> Cap = {},
ExtensionList Exts = {}, VersionTuple MinVer = VersionTuple(),
VersionTuple MaxVer = VersionTuple())
- : IsSatisfiable(IsSatisfiable), Cap(Cap), Exts(Exts), MinVer(MinVer),
- MaxVer(MaxVer) {}
+ : IsSatisfiable(IsSatisfiable), Cap(Cap), Exts(std::move(Exts)),
+ MinVer(MinVer), MaxVer(MaxVer) {}
Requirements(Capability::Capability Cap) : Requirements(true, {Cap}) {}
};
@@ -217,7 +217,8 @@ struct SPIRVModuleAnalysis : public ModulePass {
static char ID;
public:
- SPIRVModuleAnalysis() : ModulePass(ID) {}
+ SPIRVModuleAnalysis()
+ : ModulePass(ID), ST(nullptr), GR(nullptr), TII(nullptr), MMI(nullptr) {}
bool runOnModule(Module &M) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
diff --git a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp
index 1d38244..d17528d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp
@@ -147,7 +147,7 @@ void visit(MachineFunction &MF, MachineBasicBlock &Start,
// Do a preorder traversal of the CFG starting from the given function's entry
// point. Calls |op| on each basic block encountered during the traversal.
void visit(MachineFunction &MF, std::function<void(MachineBasicBlock *)> op) {
- visit(MF, *MF.begin(), op);
+ visit(MF, *MF.begin(), std::move(op));
}
bool SPIRVPostLegalizer::runOnMachineFunction(MachineFunction &MF) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index f4b4846..b62db7f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -99,6 +99,7 @@ addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR,
SPIRVType *ExtType = GR->getOrCreateSPIRVType(
Const->getType(), MIB, SPIRV::AccessQualifier::ReadWrite,
true);
+ assert(SrcMI && "Expected source instruction to be valid");
SrcMI->setDesc(STI.getInstrInfo()->get(SPIRV::OpConstantNull));
SrcMI->addOperand(MachineOperand::CreateReg(
GR->getSPIRVTypeID(ExtType), false));
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index 595424b..74aec4f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -234,7 +234,7 @@ static SmallVector<Metadata *> parseAnnotation(Value *I,
return SmallVector<Metadata *>{};
MDs.push_back(MDNode::get(Ctx, MDsItem));
}
- return Pos == static_cast<int>(Anno.length()) ? MDs
+ return Pos == static_cast<int>(Anno.length()) ? std::move(MDs)
: SmallVector<Metadata *>{};
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
index d7cf211..e0bfb77 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
@@ -226,6 +226,7 @@ void SPIRVPassConfig::addIRPasses() {
}
void SPIRVPassConfig::addISelPrepare() {
+ addPass(createSPIRVLegalizeImplicitBindingPass());
addPass(createSPIRVEmitIntrinsicsPass(&getTM<SPIRVTargetMachine>()));
if (TM.getSubtargetImpl()->isLogicalSPIRV())
addPass(createSPIRVLegalizePointerCastPass(&getTM<SPIRVTargetMachine>()));
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index 416d811..820e56b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -463,8 +463,10 @@ std::string getOclOrSpirvBuiltinDemangledName(StringRef Name) {
DemangledNameLenStart = NameSpaceStart + 11;
}
Start = Name.find_first_not_of("0123456789", DemangledNameLenStart);
- Name.substr(DemangledNameLenStart, Start - DemangledNameLenStart)
- .getAsInteger(10, Len);
+ [[maybe_unused]] bool Error =
+ Name.substr(DemangledNameLenStart, Start - DemangledNameLenStart)
+ .getAsInteger(10, Len);
+ assert(!Error && "Failed to parse demangled name length");
return Name.substr(Start, Len).str();
}
@@ -756,7 +758,7 @@ bool getVacantFunctionName(Module &M, std::string &Name) {
for (unsigned I = 0; I < MaxIters; ++I) {
std::string OrdName = Name + Twine(I).str();
if (!M.getFunction(OrdName)) {
- Name = OrdName;
+ Name = std::move(OrdName);
return true;
}
}
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 1aa8efe..c0fc3a6 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1179,7 +1179,7 @@ static void fixupVariableFloatArgs(SmallVectorImpl<CCValAssign> &ArgLocs,
if (!VA.isRegLoc() || (ValTy != MVT::f64 && ValTy != MVT::f128))
continue;
// The fixed arguments to a varargs function still go in FP registers.
- if (Outs[VA.getValNo()].IsFixed)
+ if (!Outs[VA.getValNo()].Flags.isVarArg())
continue;
// This floating point argument should be reassigned.
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
index 25f4aac..fbb98ff 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
@@ -31,10 +31,6 @@ namespace SystemZ {
class SystemZCCState : public CCState {
private:
- /// Records whether the value was a fixed argument.
- /// See ISD::OutputArg::IsFixed.
- SmallVector<bool, 4> ArgIsFixed;
-
/// Records whether the value was widened from a short vector type.
SmallVector<bool, 4> ArgIsShortVector;
@@ -50,10 +46,6 @@ public:
void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
CCAssignFn Fn) {
- // Formal arguments are always fixed.
- ArgIsFixed.clear();
- for (unsigned i = 0; i < Ins.size(); ++i)
- ArgIsFixed.push_back(true);
// Record whether the call operand was a short vector.
ArgIsShortVector.clear();
for (unsigned i = 0; i < Ins.size(); ++i)
@@ -64,10 +56,6 @@ public:
void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
CCAssignFn Fn) {
- // Record whether the call operand was a fixed argument.
- ArgIsFixed.clear();
- for (unsigned i = 0; i < Outs.size(); ++i)
- ArgIsFixed.push_back(Outs[i].IsFixed);
// Record whether the call operand was a short vector.
ArgIsShortVector.clear();
for (unsigned i = 0; i < Outs.size(); ++i)
@@ -77,12 +65,11 @@ public:
}
// This version of AnalyzeCallOperands in the base class is not usable
- // since we must provide a means of accessing ISD::OutputArg::IsFixed.
+ // since we must provide a means of detecting short vector arguments.
void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs,
SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
CCAssignFn Fn) = delete;
- bool IsFixed(unsigned ValNo) { return ArgIsFixed[ValNo]; }
bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; }
};
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 0ad872b..059f31f 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -16,14 +16,6 @@ class CCIfSubtarget<string F, CCAction A>
"getSubtarget<SystemZSubtarget>().", F),
A>;
-// Match if this specific argument is a fixed (i.e. named) argument.
-class CCIfFixed<CCAction A>
- : CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>;
-
-// Match if this specific argument is not a fixed (i.e. vararg) argument.
-class CCIfNotFixed<CCAction A>
- : CCIf<"!(static_cast<SystemZCCState *>(&State)->IsFixed(ValNo))", A>;
-
// Match if this specific argument was widened from a short vector type.
class CCIfShortVector<CCAction A>
: CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>;
@@ -79,7 +71,7 @@ def CC_SystemZ_GHC : CallingConv<[
// Pass in STG registers: XMM1, ..., XMM6
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfFixed<CCAssignToReg<[V16, V17, V18, V19, V20, V21]>>>>,
+ CCIfArgFixed<CCAssignToReg<[V16, V17, V18, V19, V20, V21]>>>>,
// Fail otherwise
CCCustom<"CC_SystemZ_GHC_Error">
@@ -125,8 +117,8 @@ def CC_SystemZ_ELF : CallingConv<[
// during type legalization.
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfFixed<CCAssignToReg<[V24, V26, V28, V30,
- V25, V27, V29, V31]>>>>,
+ CCIfArgFixed<CCAssignToReg<[V24, V26, V28, V30,
+ V25, V27, V29, V31]>>>>,
// However, sub-128 vectors which need to go on the stack occupy just a
// single 8-byte-aligned 8-byte stack slot. Pass as i64.
@@ -227,17 +219,17 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
// Promote f32 to f64 and bitcast to i64, if it needs to be passed in GPRs.
// Although we assign the f32 vararg to be bitcast, it will first be promoted
// to an f64 within convertValVTToLocVT().
- CCIfType<[f32, f64], CCIfNotFixed<CCBitConvertToType<i64>>>,
+ CCIfType<[f32, f64], CCIfArgVarArg<CCBitConvertToType<i64>>>,
// Pointers are always passed in full 64-bit registers.
CCIfPtr<CCCustom<"CC_XPLINK64_Pointer">>,
// long double, can only be passed in GPR2 and GPR3, if available,
// hence R2Q
- CCIfType<[f128], CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>,
+ CCIfType<[f128], CCIfArgVarArg<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>,
// Non fixed vector arguments are treated in the same way as long
// doubles.
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>,
+ CCIfArgVarArg<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>,
// A SwiftSelf is passed in callee-saved R10.
CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R10D]>>>,
@@ -260,22 +252,24 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
// during type legalization.
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>,
+ CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>,
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfFixed<CCAssignToRegAndStack<[V24, V25, V26, V27,
- V28, V29, V30, V31], 16, 8>>>>,
+ CCIfArgFixed<CCAssignToRegAndStack<[V24, V25, V26, V27,
+ V28, V29, V30, V31], 16, 8>>>>,
// The first 4 named float and double arguments are passed in registers
// FPR0-FPR6. The rest will be passed in the user area.
- CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
- CCIfType<[f32], CCIfFixed<CCAssignToRegAndStack<[F0S, F2S, F4S, F6S], 4, 8>>>,
- CCIfType<[f64], CCIfFixed<CCAssignToRegAndStack<[F0D, F2D, F4D, F6D], 8, 8>>>,
+ CCIfType<[f32, f64], CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
+ CCIfType<[f32],
+ CCIfArgFixed<CCAssignToRegAndStack<[F0S, F2S, F4S, F6S], 4, 8>>>,
+ CCIfType<[f64],
+ CCIfArgFixed<CCAssignToRegAndStack<[F0D, F2D, F4D, F6D], 8, 8>>>,
// The first 2 long double arguments are passed in register FPR0/FPR2
// and FPR4/FPR6. The rest will be passed in the user area.
- CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
- CCIfType<[f128], CCIfFixed<CCAssignToRegAndStack<[F0Q, F4Q], 16, 8>>>,
+ CCIfType<[f128], CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
+ CCIfType<[f128], CCIfArgFixed<CCAssignToRegAndStack<[F0Q, F4Q], 16, 8>>>,
// Other arguments are passed in 8-byte-aligned 8-byte stack slots.
CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
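The SystemZ-local CCIfFixed/CCIfNotFixed classes are retired in favor of CCIfArgFixed/CCIfArgVarArg, which, per the rest of this series, key off the per-argument vararg flag instead of the SystemZCCState side table; the custom state class now only tracks the short-vector property.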
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 6297916..5ee66e3 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -574,13 +574,11 @@ void SystemZELFFrameLowering::emitPrologue(MachineFunction &MF,
// Call mcount (Regmask from CC AnyReg since mcount preserves all normal
// argument registers).
- FunctionCallee FC = MF.getFunction().getParent()->getOrInsertFunction(
- "mcount", Type::getVoidTy(MF.getFunction().getContext()));
const uint32_t *Mask = MF.getSubtarget<SystemZSubtarget>()
.getSpecialRegisters()
->getCallPreservedMask(MF, CallingConv::AnyReg);
BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::CallBRASL))
- .addGlobalAddress(dyn_cast<Function>(FC.getCallee()))
+ .addExternalSymbol("mcount")
.addRegMask(Mask);
// Reload return address from 8 bytes above stack pointer.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 3f80b2a..f9eba4b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1309,7 +1309,7 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
OutVal = FINode;
}
// Count the number of fixed args *after* legalization.
- NumFixedArgs += Out.IsFixed;
+ NumFixedArgs += !Out.Flags.isVarArg();
}
bool IsVarArg = CLI.IsVarArg;
@@ -1503,7 +1503,7 @@ SDValue WebAssemblyTargetLowering::LowerReturn(
for (const ISD::OutputArg &Out : Outs) {
assert(!Out.Flags.isByVal() && "byval is not valid for return values");
assert(!Out.Flags.isNest() && "nest is not valid for return values");
- assert(Out.IsFixed && "non-fixed return value is not valid");
+ assert(!Out.Flags.isVarArg() && "non-fixed return value is not valid");
if (Out.Flags.isInAlloca())
fail(DL, DAG, "WebAssembly hasn't implemented inalloca results");
if (Out.Flags.isInConsecutiveRegs())
diff --git a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
index c0a6035..d9f4405 100644
--- a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
+++ b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
@@ -75,7 +75,7 @@ public:
static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2,
X86::XMM3, X86::XMM4, X86::XMM5,
X86::XMM6, X86::XMM7};
- if (!Info.IsFixed)
+ if (Flags.isVarArg())
NumXMMRegs = State.getFirstUnallocated(XMMArgRegs);
return Res;
@@ -363,7 +363,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
Info.CallConv, Info.IsVarArg))
return false;
- bool IsFixed = Info.OrigArgs.empty() ? true : Info.OrigArgs.back().IsFixed;
+ bool IsFixed =
+ Info.OrigArgs.empty() ? true : !Info.OrigArgs.back().Flags[0].isVarArg();
if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(Info.CallConv)) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
diff --git a/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp b/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp
index 2f92f86..39bec47 100644
--- a/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp
+++ b/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp
@@ -145,39 +145,40 @@ struct DecodeRegister {
};
const DecodeRegister SRDecoderTable[] = {
- {Xtensa::LBEG, 0}, {Xtensa::LEND, 1},
- {Xtensa::LCOUNT, 2}, {Xtensa::SAR, 3},
- {Xtensa::BREG, 4}, {Xtensa::LITBASE, 5},
- {Xtensa::ACCLO, 16}, {Xtensa::ACCHI, 17},
- {Xtensa::M0, 32}, {Xtensa::M1, 33},
- {Xtensa::M2, 34}, {Xtensa::M3, 35},
- {Xtensa::WINDOWBASE, 72}, {Xtensa::WINDOWSTART, 73},
- {Xtensa::IBREAKENABLE, 96}, {Xtensa::MEMCTL, 97},
- {Xtensa::DDR, 104}, {Xtensa::IBREAKA0, 128},
- {Xtensa::IBREAKA1, 129}, {Xtensa::DBREAKA0, 144},
- {Xtensa::DBREAKA1, 145}, {Xtensa::DBREAKC0, 160},
- {Xtensa::DBREAKC1, 161}, {Xtensa::CONFIGID0, 176},
- {Xtensa::EPC1, 177}, {Xtensa::EPC2, 178},
- {Xtensa::EPC3, 179}, {Xtensa::EPC4, 180},
- {Xtensa::EPC5, 181}, {Xtensa::EPC6, 182},
- {Xtensa::EPC7, 183}, {Xtensa::DEPC, 192},
- {Xtensa::EPS2, 194}, {Xtensa::EPS3, 195},
- {Xtensa::EPS4, 196}, {Xtensa::EPS5, 197},
- {Xtensa::EPS6, 198}, {Xtensa::EPS7, 199},
- {Xtensa::CONFIGID1, 208}, {Xtensa::EXCSAVE1, 209},
- {Xtensa::EXCSAVE2, 210}, {Xtensa::EXCSAVE3, 211},
- {Xtensa::EXCSAVE4, 212}, {Xtensa::EXCSAVE5, 213},
- {Xtensa::EXCSAVE6, 214}, {Xtensa::EXCSAVE7, 215},
- {Xtensa::CPENABLE, 224}, {Xtensa::INTERRUPT, 226},
- {Xtensa::INTCLEAR, 227}, {Xtensa::INTENABLE, 228},
- {Xtensa::PS, 230}, {Xtensa::VECBASE, 231},
- {Xtensa::EXCCAUSE, 232}, {Xtensa::DEBUGCAUSE, 233},
- {Xtensa::CCOUNT, 234}, {Xtensa::PRID, 235},
- {Xtensa::ICOUNT, 236}, {Xtensa::ICOUNTLEVEL, 237},
- {Xtensa::EXCVADDR, 238}, {Xtensa::CCOMPARE0, 240},
- {Xtensa::CCOMPARE1, 241}, {Xtensa::CCOMPARE2, 242},
- {Xtensa::MISC0, 244}, {Xtensa::MISC1, 245},
- {Xtensa::MISC2, 246}, {Xtensa::MISC3, 247}};
+ {Xtensa::LBEG, 0}, {Xtensa::LEND, 1},
+ {Xtensa::LCOUNT, 2}, {Xtensa::SAR, 3},
+ {Xtensa::BREG, 4}, {Xtensa::LITBASE, 5},
+ {Xtensa::SCOMPARE1, 12}, {Xtensa::ACCLO, 16},
+ {Xtensa::ACCHI, 17}, {Xtensa::M0, 32},
+ {Xtensa::M1, 33}, {Xtensa::M2, 34},
+ {Xtensa::M3, 35}, {Xtensa::WINDOWBASE, 72},
+ {Xtensa::WINDOWSTART, 73}, {Xtensa::IBREAKENABLE, 96},
+ {Xtensa::MEMCTL, 97}, {Xtensa::ATOMCTL, 99},
+ {Xtensa::DDR, 104}, {Xtensa::IBREAKA0, 128},
+ {Xtensa::IBREAKA1, 129}, {Xtensa::DBREAKA0, 144},
+ {Xtensa::DBREAKA1, 145}, {Xtensa::DBREAKC0, 160},
+ {Xtensa::DBREAKC1, 161}, {Xtensa::CONFIGID0, 176},
+ {Xtensa::EPC1, 177}, {Xtensa::EPC2, 178},
+ {Xtensa::EPC3, 179}, {Xtensa::EPC4, 180},
+ {Xtensa::EPC5, 181}, {Xtensa::EPC6, 182},
+ {Xtensa::EPC7, 183}, {Xtensa::DEPC, 192},
+ {Xtensa::EPS2, 194}, {Xtensa::EPS3, 195},
+ {Xtensa::EPS4, 196}, {Xtensa::EPS5, 197},
+ {Xtensa::EPS6, 198}, {Xtensa::EPS7, 199},
+ {Xtensa::CONFIGID1, 208}, {Xtensa::EXCSAVE1, 209},
+ {Xtensa::EXCSAVE2, 210}, {Xtensa::EXCSAVE3, 211},
+ {Xtensa::EXCSAVE4, 212}, {Xtensa::EXCSAVE5, 213},
+ {Xtensa::EXCSAVE6, 214}, {Xtensa::EXCSAVE7, 215},
+ {Xtensa::CPENABLE, 224}, {Xtensa::INTERRUPT, 226},
+ {Xtensa::INTCLEAR, 227}, {Xtensa::INTENABLE, 228},
+ {Xtensa::PS, 230}, {Xtensa::VECBASE, 231},
+ {Xtensa::EXCCAUSE, 232}, {Xtensa::DEBUGCAUSE, 233},
+ {Xtensa::CCOUNT, 234}, {Xtensa::PRID, 235},
+ {Xtensa::ICOUNT, 236}, {Xtensa::ICOUNTLEVEL, 237},
+ {Xtensa::EXCVADDR, 238}, {Xtensa::CCOMPARE0, 240},
+ {Xtensa::CCOMPARE1, 241}, {Xtensa::CCOMPARE2, 242},
+ {Xtensa::MISC0, 244}, {Xtensa::MISC1, 245},
+ {Xtensa::MISC2, 246}, {Xtensa::MISC3, 247}};
static DecodeStatus DecodeSRRegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
index 821cba0..080a9c0 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
@@ -200,6 +200,9 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits,
case Xtensa::WINDOWBASE:
case Xtensa::WINDOWSTART:
return FeatureBits[Xtensa::FeatureWindowed];
+ case Xtensa::ATOMCTL:
+ case Xtensa::SCOMPARE1:
+ return FeatureBits[Xtensa::FeatureS32C1I];
case Xtensa::NoRegister:
return false;
}
diff --git a/llvm/lib/Target/Xtensa/XtensaFeatures.td b/llvm/lib/Target/Xtensa/XtensaFeatures.td
index 97d5472..d6f3ef0 100644
--- a/llvm/lib/Target/Xtensa/XtensaFeatures.td
+++ b/llvm/lib/Target/Xtensa/XtensaFeatures.td
@@ -73,6 +73,22 @@ def FeatureDiv32 : SubtargetFeature<"div32", "HasDiv32", "true",
def HasDiv32 : Predicate<"Subtarget->hasDiv32()">,
AssemblerPredicate<(all_of FeatureDiv32)>;
+def FeatureS32C1I : SubtargetFeature<"s32c1i", "HasS32C1I", "true",
+ "Enable Xtensa S32C1I option">;
+def HasS32C1I : Predicate<"Subtarget->hasS32C1I()">,
+ AssemblerPredicate<(all_of FeatureS32C1I)>;
+
+// Assume that lock-free native-width atomics are available, even if the target
+// and operating system combination would not usually provide them. The user
+// is responsible for providing any necessary __sync implementations. Code
+// built with this feature is not ABI-compatible with code built without this
+// feature, if atomic variables are exposed across the ABI boundary.
+def FeatureForcedAtomics : SubtargetFeature<"forced-atomics", "HasForcedAtomics", "true",
+ "Assume that lock-free native-width atomics are available">;
+def HasForcedAtomics : Predicate<"Subtarget->hasForcedAtomics()">,
+ AssemblerPredicate<(all_of FeatureForcedAtomics)>;
+def HasAtomicLdSt : Predicate<"Subtarget->hasS32C1I() || Subtarget->hasForcedAtomics()">;
+
def FeatureRegionProtection : SubtargetFeature<"regprotect", "HasRegionProtection", "true",
"Enable Xtensa Region Protection option">;
def HasRegionProtection : Predicate<"Subtarget->hasRegionProtection()">,
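As the forced-atomics comment above states, any __sync helpers the generated code calls must come from the user. A minimal sketch of one such helper follows; the single-core assumption and the omitted interrupt masking are assumptions for illustration, not part of this patch:

#include <cstdint>

// Sketch only: a 32-bit compare-and-swap libcall for a single-core system.
// A real implementation must make the read-modify-write atomic with respect
// to interrupts (e.g. by masking them); that platform-specific step is
// deliberately omitted here.
extern "C" uint32_t __sync_val_compare_and_swap_4(volatile uint32_t *ptr,
                                                  uint32_t expected,
                                                  uint32_t desired) {
  uint32_t old = *ptr;
  if (old == expected)
    *ptr = desired;
  return old;
}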
diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
index fd42fd2..6a07bd8 100644
--- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
@@ -250,6 +250,15 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM,
// Floating-point truncation and stores need to be done separately.
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ if (Subtarget.hasS32C1I()) {
+ setMaxAtomicSizeInBitsSupported(32);
+ setMinCmpXchgSizeInBits(32);
+ } else if (Subtarget.hasForcedAtomics()) {
+ setMaxAtomicSizeInBitsSupported(32);
+ } else {
+ setMaxAtomicSizeInBitsSupported(0);
+ }
+
// Compute derived properties from the register classes
computeRegisterProperties(STI.getRegisterInfo());
}
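The effect of these limits can be seen from ordinary std::atomic code; a rough sketch (assuming a target built with +s32c1i) of which operations stay inline and which become library calls:

#include <atomic>
#include <cstdint>

std::atomic<uint32_t> a32{0};
std::atomic<uint64_t> a64{0};

// 32 bits fits setMaxAtomicSizeInBitsSupported(32): expanded in IR and then
// selected through the S32C1I-based cmpxchg added later in this patch.
uint32_t bump32() { return a32.fetch_add(1); }

// 64 bits exceeds the supported width, so AtomicExpand lowers this to an
// __atomic_* library call instead.
uint64_t bump64() { return a64.fetch_add(1); }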
@@ -1548,6 +1557,11 @@ const char *XtensaTargetLowering::getTargetNodeName(unsigned Opcode) const {
return nullptr;
}
+TargetLowering::AtomicExpansionKind
+XtensaTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+ return AtomicExpansionKind::CmpXChg;
+}
+
//===----------------------------------------------------------------------===//
// Custom insertion
//===----------------------------------------------------------------------===//
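Returning AtomicExpansionKind::CmpXChg makes AtomicExpand rewrite every atomicrmw as a compare-and-swap loop. A rough C++-level sketch of the resulting shape for a 32-bit atomic add (illustrative only, not the pass output):

#include <cstdint>

uint32_t atomic_add_via_cas(volatile uint32_t *p, uint32_t v) {
  uint32_t old = __atomic_load_n(p, __ATOMIC_RELAXED);
  uint32_t desired;
  do {
    desired = old + v;
    // On failure, 'old' is refreshed with the current memory contents and
    // the loop retries, mirroring the IR cmpxchg loop AtomicExpand emits.
  } while (!__atomic_compare_exchange_n(p, &old, desired, /*weak=*/true,
                                        __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
  return old; // value before the add, as atomicrmw requires
}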
@@ -1696,6 +1710,23 @@ MachineBasicBlock *XtensaTargetLowering::EmitInstrWithCustomInserter(
return MBB;
}
+ case Xtensa::ATOMIC_CMP_SWAP_32_P: {
+ MachineOperand &R = MI.getOperand(0);
+ MachineOperand &Addr = MI.getOperand(1);
+ MachineOperand &Cmp = MI.getOperand(2);
+ MachineOperand &Swap = MI.getOperand(3);
+
+ BuildMI(*MBB, MI, DL, TII.get(Xtensa::WSR), Xtensa::SCOMPARE1)
+ .addReg(Cmp.getReg());
+
+ BuildMI(*MBB, MI, DL, TII.get(Xtensa::S32C1I), R.getReg())
+ .addReg(Swap.getReg())
+ .addReg(Addr.getReg())
+ .addImm(0);
+
+ MI.eraseFromParent();
+ return MBB;
+ }
default:
llvm_unreachable("Unexpected instr type to insert");
}
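At the source level, the pseudo expanded above is a plain 32-bit compare-and-swap; a minimal sketch (assuming +s32c1i), with the comment describing the expansion shown in the inserter:

#include <atomic>
#include <cstdint>

std::atomic<uint32_t> word{0};

bool try_update(uint32_t expected, uint32_t desired) {
  // Selected as ATOMIC_CMP_SWAP_32_P, which the custom inserter expands to a
  // WSR into SCOMPARE1 (the expected value) followed by S32C1I (the
  // conditional store that also returns the old memory contents).
  return word.compare_exchange_strong(expected, desired);
}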
diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.h b/llvm/lib/Target/Xtensa/XtensaISelLowering.h
index e6ddf98..d84cbdb 100644
--- a/llvm/lib/Target/Xtensa/XtensaISelLowering.h
+++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.h
@@ -145,6 +145,12 @@ public:
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+ return true;
+ }
+
+ AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
+
bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
SDValue C) const override;
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
index 31608f4..edcf247 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
@@ -496,6 +496,8 @@ def EXTW : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins),
let hasSideEffects = 1;
}
+def : Pat<(atomic_fence timm, timm), (MEMW)>;
+
//===----------------------------------------------------------------------===//
// Illegal instructions
//===----------------------------------------------------------------------===//
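With the pattern above, an explicit or compiler-inserted atomic fence selects to the MEMW barrier; a small sketch of source code that produces one:

#include <atomic>

void publish_barrier() {
  // Lowered to an ISD::ATOMIC_FENCE node, which the pattern above matches to
  // MEMW regardless of the ordering/scope immediates.
  std::atomic_thread_fence(std::memory_order_seq_cst);
}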
@@ -1499,6 +1501,46 @@ def RFI : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins uimm4:$imm),
}
//===----------------------------------------------------------------------===//
+// S32C1I
+//===----------------------------------------------------------------------===//
+
+let mayStore = 1, mayLoad = 1, Predicates = [HasS32C1I] in {
+ def S32C1I : RRI8_Inst<0x02, (outs AR:$a), (ins AR:$t, mem32:$addr),
+ "s32c1i\t$t, $addr", []> {
+ bits<12> addr;
+
+ let r = 0x0e;
+ let Uses = [SCOMPARE1];
+ let Constraints = "$a = $t";
+ let imm8{7-0} = addr{11-4};
+ let s{3-0} = addr{3-0};
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Atomic patterns
+//===----------------------------------------------------------------------===//
+
+// Atomic loads/stores are available under both +s32c1i and +forced-atomics.
+// Fences will be inserted for atomic loads/stores according to the logic in
+// XtensaTargetLowering.
+let Predicates = [HasAtomicLdSt] in {
+ def : Pat<(i32 (atomic_load_8 addr_ish1:$addr)), (L8UI addr_ish1:$addr)>;
+ def : Pat<(i32 (atomic_load_16 addr_ish2:$addr)), (L16UI addr_ish2:$addr)>;
+ def : Pat<(i32 (atomic_load_32 addr_ish4:$addr)), (L32I addr_ish4:$addr)>;
+
+ def : Pat<(atomic_store_8 AR:$t, addr_ish1:$addr), (S8I AR:$t, addr_ish1:$addr)>;
+ def : Pat<(atomic_store_16 AR:$t, addr_ish2:$addr), (S16I AR:$t, addr_ish2:$addr)>;
+ def : Pat<(atomic_store_32 AR:$t, addr_ish4:$addr), (S32I AR:$t, addr_ish4:$addr)>;
+}
+
+let usesCustomInserter = 1, Predicates = [HasS32C1I] in {
+ def ATOMIC_CMP_SWAP_32_P : Pseudo<(outs AR:$dst), (ins AR:$ptr, AR:$cmp, AR:$swap),
+ "!atomic_cmp_swap_32_p, $dst, $ptr, $cmp, $swap",
+ [(set AR:$dst, (atomic_cmp_swap_i32 AR:$ptr, AR:$cmp, AR:$swap))]>;
+}
+
+//===----------------------------------------------------------------------===//
// DSP Instructions
//===----------------------------------------------------------------------===//
include "XtensaDSPInstrInfo.td"
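A short sketch of what the HasAtomicLdSt patterns cover: atomic loads and stores reuse the ordinary load/store instructions, with ordering enforced by fences the lowering inserts around them (see shouldInsertFencesForAtomic above). The instruction names in the comments are assumptions about the typical selection, not guarantees:

#include <atomic>
#include <cstdint>

std::atomic<uint8_t> flag{0};

// Roughly: L8UI for the load, with a trailing MEMW for the acquire ordering.
uint8_t peek() { return flag.load(std::memory_order_acquire); }

// Roughly: a leading MEMW for the release ordering, then S8I for the store.
void publish(uint8_t v) { flag.store(v, std::memory_order_release); }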
diff --git a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td
index 596c410..d1f2c6b 100644
--- a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td
+++ b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td
@@ -84,6 +84,9 @@ def SAR : SRReg<3, "sar", ["SAR","3"]>;
// Boolean Register
def BREG : SRReg<4, "br", ["BR","4"]>;
+// Expected data value for S32C1I operation
+def SCOMPARE1 : SRReg<12, "scompare1", ["SCOMPARE1", "12"]>;
+
// Literal base
def LITBASE : SRReg<5, "litbase", ["LITBASE", "5"]>;
@@ -97,6 +100,9 @@ def IBREAKENABLE : SRReg<96, "ibreakenable", ["IBREAKENABLE", "96"]>;
// Memory Control Register
def MEMCTL : SRReg<97, "memctl", ["MEMCTL", "97"]>;
+// Atomic Operation Control
+def ATOMCTL : SRReg<99, "atomctl", ["ATOMCTL", "99"]>;
+
def DDR : SRReg<104, "ddr", ["DDR", "104"]>;
// Instruction break address register 0
@@ -218,8 +224,8 @@ def MR23 : RegisterClass<"Xtensa", [i32], 32, (add M2, M3)>;
def MR : RegisterClass<"Xtensa", [i32], 32, (add MR01, MR23)>;
def SR : RegisterClass<"Xtensa", [i32], 32, (add
- LBEG, LEND, LCOUNT, SAR, BREG, LITBASE, ACCLO, ACCHI, MR,
- WINDOWBASE, WINDOWSTART, IBREAKENABLE, MEMCTL, DDR, IBREAKA0, IBREAKA1,
+ LBEG, LEND, LCOUNT, SAR, BREG, SCOMPARE1, LITBASE, ACCLO, ACCHI, MR,
+ WINDOWBASE, WINDOWSTART, IBREAKENABLE, MEMCTL, ATOMCTL, DDR, IBREAKA0, IBREAKA1,
DBREAKA0, DBREAKA1, DBREAKC0, DBREAKC1, CONFIGID0, EPC1, EPC2, EPC3, EPC4, EPC5,
EPC6, EPC7, DEPC, EPS2, EPS3, EPS4, EPS5, EPS6, EPS7, CONFIGID1, EXCSAVE1, EXCSAVE2,
EXCSAVE3, EXCSAVE4, EXCSAVE5, EXCSAVE6, EXCSAVE7, CPENABLE, INTERRUPT, INTSET, INTCLEAR, INTENABLE,
diff --git a/llvm/lib/Target/Xtensa/XtensaSubtarget.h b/llvm/lib/Target/Xtensa/XtensaSubtarget.h
index fd677a4..b406534 100644
--- a/llvm/lib/Target/Xtensa/XtensaSubtarget.h
+++ b/llvm/lib/Target/Xtensa/XtensaSubtarget.h
@@ -77,6 +77,8 @@ public:
bool hasMul32() const { return HasMul32; }
bool hasMul32High() const { return HasMul32High; }
bool hasDiv32() const { return HasDiv32; }
+ bool hasS32C1I() const { return HasS32C1I; }
+ bool hasForcedAtomics() const { return HasForcedAtomics; }
bool hasSingleFloat() const { return HasSingleFloat; }
bool hasRegionProtection() const { return HasRegionProtection; }
bool hasRelocatableVector() const { return HasRelocatableVector; }
diff --git a/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp b/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp
index 8d2dca6..c9f1ca8 100644
--- a/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp
@@ -107,6 +107,7 @@ public:
}
bool addInstSelector() override;
+ void addIRPasses() override;
void addPreEmitPass() override;
};
} // end anonymous namespace
@@ -116,6 +117,11 @@ bool XtensaPassConfig::addInstSelector() {
return false;
}
+void XtensaPassConfig::addIRPasses() {
+ addPass(createAtomicExpandLegacyPass());
+ TargetPassConfig::addIRPasses();
+}
+
void XtensaPassConfig::addPreEmitPass() { addPass(&BranchRelaxationPassID); }
TargetPassConfig *XtensaTargetMachine::createPassConfig(PassManagerBase &PM) {