Diffstat (limited to 'llvm/lib')
-rw-r--r-- llvm/lib/Analysis/ConstantFolding.cpp | 3
-rw-r--r-- llvm/lib/Analysis/Delinearization.cpp | 200
-rw-r--r-- llvm/lib/Analysis/DependenceAnalysis.cpp | 510
-rw-r--r-- llvm/lib/Analysis/MemoryDependenceAnalysis.cpp | 10
-rw-r--r-- llvm/lib/Analysis/MemoryLocation.cpp | 2
-rw-r--r-- llvm/lib/Analysis/StackLifetime.cpp | 2
-rw-r--r-- llvm/lib/Analysis/ValueTracking.cpp | 3
-rw-r--r-- llvm/lib/Analysis/VectorUtils.cpp | 6
-rw-r--r-- llvm/lib/AsmParser/LLLexer.cpp | 1
-rw-r--r-- llvm/lib/AsmParser/LLParser.cpp | 2
-rw-r--r-- llvm/lib/BinaryFormat/DXContainer.cpp | 11
-rw-r--r-- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 1
-rw-r--r-- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 1
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 1
-rw-r--r-- llvm/lib/CodeGen/AtomicExpandPass.cpp | 13
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 9
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/RegisterPressure.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 45
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 57
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 25
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 1
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 18
-rw-r--r-- llvm/lib/CodeGen/TargetLoweringBase.cpp | 5
-rw-r--r-- llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp | 22
-rw-r--r-- llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp | 9
-rw-r--r-- llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp | 11
-rw-r--r-- llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp | 2
-rw-r--r-- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 445
-rw-r--r-- llvm/lib/IR/AutoUpgrade.cpp | 26
-rw-r--r-- llvm/lib/IR/ConstantFold.cpp | 1
-rw-r--r-- llvm/lib/IR/ConstantRange.cpp | 1
-rw-r--r-- llvm/lib/IR/Constants.cpp | 20
-rw-r--r-- llvm/lib/IR/DebugInfoMetadata.cpp | 10
-rw-r--r-- llvm/lib/IR/Globals.cpp | 1
-rw-r--r-- llvm/lib/IR/IRBuilder.cpp | 20
-rw-r--r-- llvm/lib/IR/Instruction.cpp | 1
-rw-r--r-- llvm/lib/IR/Instructions.cpp | 52
-rw-r--r-- llvm/lib/IR/LLVMContextImpl.h | 27
-rw-r--r-- llvm/lib/IR/Value.cpp | 46
-rw-r--r-- llvm/lib/IR/Verifier.cpp | 33
-rw-r--r-- llvm/lib/MC/MCObjectStreamer.cpp | 2
-rw-r--r-- llvm/lib/MC/MCSection.cpp | 2
-rw-r--r-- llvm/lib/ProfileData/InstrProfWriter.cpp | 63
-rw-r--r-- llvm/lib/SandboxIR/Context.cpp | 1
-rw-r--r-- llvm/lib/SandboxIR/Instruction.cpp | 3
-rw-r--r-- llvm/lib/Support/StringRef.cpp | 6
-rw-r--r-- llvm/lib/Support/regcomp.c | 2
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 10
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 204
-rw-r--r-- llvm/lib/Target/AArch64/AArch64InstrFormats.td | 10
-rw-r--r-- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 4
-rw-r--r-- llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp | 64
-rw-r--r-- llvm/lib/Target/AArch64/AArch64Processors.td | 11
-rw-r--r-- llvm/lib/Target/AArch64/AArch64StackTagging.cpp | 3
-rw-r--r-- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 99
-rw-r--r-- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 8
-rw-r--r-- llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 85
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 12
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 38
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp | 13
-rw-r--r-- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 37
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 56
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 43
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 145
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 9
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 10
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 27
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 67
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 13
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 15
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 2
-rw-r--r-- llvm/lib/Target/ARM/ARMISelLowering.cpp | 140
-rw-r--r-- llvm/lib/Target/ARM/ARMISelLowering.h | 3
-rw-r--r-- llvm/lib/Target/AVR/AVRISelLowering.cpp | 7
-rw-r--r-- llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp | 37
-rw-r--r-- llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 15
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchISelLowering.h | 2
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp | 3
-rw-r--r-- llvm/lib/Target/Mips/MipsCCState.cpp | 5
-rw-r--r-- llvm/lib/Target/Mips/MipsCCState.h | 8
-rw-r--r-- llvm/lib/Target/Mips/MipsCallLowering.cpp | 2
-rw-r--r-- llvm/lib/Target/Mips/MipsCallingConv.td | 8
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 150
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 6
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 44
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 62
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 8
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 7
-rw-r--r-- llvm/lib/Target/PowerPC/PPCCCState.h | 30
-rw-r--r-- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 13
-rw-r--r-- llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 17
-rw-r--r-- llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp | 6
-rw-r--r-- llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h | 5
-rw-r--r-- llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp | 6
-rw-r--r-- llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp | 10
-rw-r--r-- llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 32
-rw-r--r-- llvm/lib/Target/RISCV/RISCVCallingConv.cpp | 14
-rw-r--r-- llvm/lib/Target/RISCV/RISCVCallingConv.h | 6
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 170
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 22
-rw-r--r-- llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td | 200
-rw-r--r-- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 3
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp | 29
-rw-r--r-- llvm/lib/Target/Sparc/SparcISelLowering.cpp | 2
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZCallingConv.h | 15
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZCallingConv.td | 38
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp | 3
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h | 3
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp | 14
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h | 3
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp | 9
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 4
-rw-r--r-- llvm/lib/Target/X86/GISel/X86CallLowering.cpp | 5
-rw-r--r-- llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 2
-rw-r--r-- llvm/lib/Transforms/IPO/ExpandVariadics.cpp | 21
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 48
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 20
-rw-r--r-- llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp | 16
-rw-r--r-- llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp | 20
-rw-r--r-- llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp | 38
-rw-r--r-- llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Scalar/InferAlignment.cpp | 49
-rw-r--r-- llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp | 3
-rw-r--r-- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Scalar/NewGVN.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Scalar/SROA.cpp | 27
-rw-r--r-- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Utils/InlineFunction.cpp | 170
-rw-r--r-- llvm/lib/Transforms/Utils/Local.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 110
-rw-r--r-- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 164
-rw-r--r-- llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp | 1
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlan.cpp | 23
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlan.h | 10
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 91
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 13
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 2
148 files changed, 3177 insertions(+), 1606 deletions(-)
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index dd98b62..c14cb9e 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1485,6 +1485,9 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
switch (Opcode) {
default:
llvm_unreachable("Missing case");
+ case Instruction::PtrToAddr:
+ // TODO: Add some of the ptrtoint folds here as well.
+ break;
case Instruction::PtrToInt:
if (auto *CE = dyn_cast<ConstantExpr>(C)) {
Constant *FoldedValue = nullptr;
diff --git a/llvm/lib/Analysis/Delinearization.cpp b/llvm/lib/Analysis/Delinearization.cpp
index 329bd35..761c566 100644
--- a/llvm/lib/Analysis/Delinearization.cpp
+++ b/llvm/lib/Analysis/Delinearization.cpp
@@ -24,6 +24,7 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -32,6 +33,11 @@ using namespace llvm;
#define DL_NAME "delinearize"
#define DEBUG_TYPE DL_NAME
+static cl::opt<bool> UseFixedSizeArrayHeuristic(
+ "delinearize-use-fixed-size-array-heuristic", cl::init(false), cl::Hidden,
+ cl::desc("When printing analysis, use the heuristic for fixed-size arrays "
+             "if the default delinearization fails."));
+
// Return true when S contains at least an undef value.
static inline bool containsUndefs(const SCEV *S) {
return SCEVExprContains(S, [](const SCEV *S) {
@@ -480,6 +486,184 @@ void llvm::delinearize(ScalarEvolution &SE, const SCEV *Expr,
});
}
+static std::optional<APInt> tryIntoAPInt(const SCEV *S) {
+ if (const auto *Const = dyn_cast<SCEVConstant>(S))
+ return Const->getAPInt();
+ return std::nullopt;
+}
+
+/// Collects the absolute values of constant steps for all induction variables.
+/// Returns true if we can prove that all step recurrences are constants and \p
+/// Expr is divisible by \p ElementSize. Each step recurrence is stored in \p
+/// Steps after being divided by \p ElementSize.
+static bool collectConstantAbsSteps(ScalarEvolution &SE, const SCEV *Expr,
+ SmallVectorImpl<uint64_t> &Steps,
+ uint64_t ElementSize) {
+ // End of recursion. The constant value also must be a multiple of
+ // ElementSize.
+ if (const auto *Const = dyn_cast<SCEVConstant>(Expr)) {
+ const uint64_t Mod = Const->getAPInt().urem(ElementSize);
+ return Mod == 0;
+ }
+
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Expr);
+ if (!AR || !AR->isAffine())
+ return false;
+
+ const SCEV *Step = AR->getStepRecurrence(SE);
+ std::optional<APInt> StepAPInt = tryIntoAPInt(Step);
+ if (!StepAPInt)
+ return false;
+
+ APInt Q;
+ uint64_t R;
+ APInt::udivrem(StepAPInt->abs(), ElementSize, Q, R);
+ if (R != 0)
+ return false;
+
+ // Bail out when the step is too large.
+ std::optional<uint64_t> StepVal = Q.tryZExtValue();
+ if (!StepVal)
+ return false;
+
+ Steps.push_back(*StepVal);
+ return collectConstantAbsSteps(SE, AR->getStart(), Steps, ElementSize);
+}
+
+bool llvm::findFixedSizeArrayDimensions(ScalarEvolution &SE, const SCEV *Expr,
+ SmallVectorImpl<uint64_t> &Sizes,
+ const SCEV *ElementSize) {
+ if (!ElementSize)
+ return false;
+
+ std::optional<APInt> ElementSizeAPInt = tryIntoAPInt(ElementSize);
+ if (!ElementSizeAPInt || *ElementSizeAPInt == 0)
+ return false;
+
+ std::optional<uint64_t> ElementSizeConst = ElementSizeAPInt->tryZExtValue();
+
+ // Early exit when ElementSize is not a positive constant.
+ if (!ElementSizeConst)
+ return false;
+
+ if (!collectConstantAbsSteps(SE, Expr, Sizes, *ElementSizeConst) ||
+ Sizes.empty()) {
+ Sizes.clear();
+ return false;
+ }
+
+  // At this point, Sizes contains the absolute step recurrences for all
+  // induction variables. Each step recurrence must be a multiple of the size
+  // of the array element. Assuming that each value represents the size of an
+  // array dimension, attempt to restore the length of each dimension by
+  // dividing each step recurrence by the next smaller one. For example, if
+  // we have the following AddRec SCEV:
+  //
+  // AddRec: {{{0,+,2048}<%for.i>,+,256}<%for.j>,+,8}<%for.k> (ElementSize=8)
+  //
+  // Then Sizes will become [256, 32, 1] after sorting. We don't know the size
+  // of the outermost dimension; the next dimension is computed as 256 / 32 =
+  // 8, and the last dimension as 32 / 1 = 32. Thus the result is something
+  // like Arr[UnknownSize][8][32] with elements of size 8 bytes, where Arr is
+  // the base pointer.
+ //
+ // TODO: Catch more cases, e.g., when a step recurrence is not divisible by
+ // the next smaller one, like A[i][3*j].
+ llvm::sort(Sizes.rbegin(), Sizes.rend());
+ Sizes.erase(llvm::unique(Sizes), Sizes.end());
+
+  // The last element in Sizes should correspond to ElementSize. At this
+  // point, all values in Sizes have been divided by ElementSize, so the last
+  // element is replaced with 1.
+ assert(Sizes.back() != 0 && "Unexpected zero size in Sizes.");
+ Sizes.back() = 1;
+
+ for (unsigned I = 0; I + 1 < Sizes.size(); I++) {
+ uint64_t PrevSize = Sizes[I + 1];
+ if (Sizes[I] % PrevSize) {
+ Sizes.clear();
+ return false;
+ }
+ Sizes[I] /= PrevSize;
+ }
+
+ // Finally, the last element in Sizes should be ElementSize.
+ Sizes.back() = *ElementSizeConst;
+ return true;
+}
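
To make the size-restoration arithmetic concrete, the following is a minimal standalone sketch (hypothetical driver code, not part of the patch) that mirrors the second phase of findFixedSizeArrayDimensions on the worked example above: the constant steps {2048, 256, 8} with ElementSize 8 reduce to [256, 32, 1], and dividing each entry by the next smaller one recovers the inner dimensions [8][32].

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const uint64_t ElementSize = 8;
  // Absolute step recurrences {2048, 256, 8}, already divided by ElementSize.
  std::vector<uint64_t> Sizes = {2048 / ElementSize, 256 / ElementSize,
                                 8 / ElementSize}; // {256, 32, 1}
  std::sort(Sizes.rbegin(), Sizes.rend()); // sort descending
  Sizes.erase(std::unique(Sizes.begin(), Sizes.end()), Sizes.end());
  Sizes.back() = 1; // the innermost step corresponds to a single element
  for (size_t I = 0; I + 1 < Sizes.size(); ++I) {
    // The real heuristic bails out (returns false) when this check fails.
    assert(Sizes[I] % Sizes[I + 1] == 0 && "step not divisible by next size");
    Sizes[I] /= Sizes[I + 1];
  }
  Sizes.back() = ElementSize;
  for (uint64_t S : Sizes) // prints "8 32 8", i.e.
    std::cout << S << ' '; // ArrayDecl[UnknownSize][8][32], 8-byte elements
  std::cout << '\n';
}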
+
+/// Splits the SCEV into two vectors of SCEVs representing the subscripts and
+/// sizes of an array access, assuming that the array is a fixed size array.
+///
+/// E.g., if we have code like the following:
+///
+/// double A[42][8][32];
+/// for i
+/// for j
+/// for k
+/// use A[i][j][k]
+///
+/// The access function will be represented as an AddRec SCEV like:
+///
+/// AddRec: {{{0,+,2048}<%for.i>,+,256}<%for.j>,+,8}<%for.k> (ElementSize=8)
+///
+/// Then findFixedSizeArrayDimensions infers the size of each dimension of the
+/// array based on the fact that the value of the step recurrence is a multiple
+/// of the size of the corresponding array element. In the above example, it
+/// results in the following:
+///
+/// CHECK: ArrayDecl[UnknownSize][8][32] with elements of 8 bytes.
+///
+/// Finally each subscript will be computed as follows:
+///
+/// CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>][{0,+,1}<%for.k>]
+///
+/// Note that this function doesn't check the range of possible values for each
+/// subscript, so the caller should perform additional boundary checks if
+/// necessary.
+///
+/// Also note that this function doesn't guarantee that the original array size
+/// is restored "correctly". For example, in the following case:
+///
+/// double A[42][4][64];
+/// double B[42][8][32];
+/// for i
+/// for j
+/// for k
+/// use A[i][j][k]
+/// use B[i][2*j][k]
+///
+/// The access function for both accesses will be the same:
+///
+/// AddRec: {{{0,+,2048}<%for.i>,+,512}<%for.j>,+,8}<%for.k> (ElementSize=8)
+///
+/// The array sizes for both A and B will be computed as
+/// ArrayDecl[UnknownSize][4][64], which matches for A, but not for B.
+///
+/// TODO: At the moment, this function can handle only simple cases. For
+/// example, we cannot handle a case where a step recurrence is not divisible
+/// by the next smaller step recurrence, e.g., A[i][3*j].
+bool llvm::delinearizeFixedSizeArray(ScalarEvolution &SE, const SCEV *Expr,
+ SmallVectorImpl<const SCEV *> &Subscripts,
+ SmallVectorImpl<const SCEV *> &Sizes,
+ const SCEV *ElementSize) {
+
+ // First step: find the fixed array size.
+ SmallVector<uint64_t, 4> ConstSizes;
+ if (!findFixedSizeArrayDimensions(SE, Expr, ConstSizes, ElementSize)) {
+ Sizes.clear();
+ return false;
+ }
+
+ // Convert the constant size to SCEV.
+ for (uint64_t Size : ConstSizes)
+ Sizes.push_back(SE.getConstant(Expr->getType(), Size));
+
+ // Second step: compute the access functions for each subscript.
+ computeAccessFunctions(SE, Expr, Subscripts, Sizes);
+
+ return !Subscripts.empty();
+}
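
For clients outside the analysis printer, here is a minimal sketch of how the new entry point might be driven; the helper name tryDelinearizeAccess is hypothetical, and only delinearize and delinearizeFixedSizeArray come from the existing header and this patch.

// Sketch: parametric delinearization first, fixed-size heuristic as fallback.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/Delinearization.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

static bool tryDelinearizeAccess(ScalarEvolution &SE, const SCEV *AccessFn,
                                 const SCEV *ElementSize,
                                 SmallVectorImpl<const SCEV *> &Subscripts,
                                 SmallVectorImpl<const SCEV *> &Sizes) {
  // Try the general (parametric) delinearization first.
  delinearize(SE, AccessFn, Subscripts, Sizes, ElementSize);
  if (!Subscripts.empty() && Subscripts.size() == Sizes.size())
    return true;
  // Fall back to the fixed-size-array heuristic added by this patch.
  Subscripts.clear();
  Sizes.clear();
  return delinearizeFixedSizeArray(SE, AccessFn, Subscripts, Sizes,
                                   ElementSize);
}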
+
bool llvm::getIndexExpressionsFromGEP(ScalarEvolution &SE,
const GetElementPtrInst *GEP,
SmallVectorImpl<const SCEV *> &Subscripts,
@@ -586,9 +770,21 @@ void printDelinearization(raw_ostream &O, Function *F, LoopInfo *LI,
O << "AccessFunction: " << *AccessFn << "\n";
SmallVector<const SCEV *, 3> Subscripts, Sizes;
+
+ auto IsDelinearizationFailed = [&]() {
+ return Subscripts.size() == 0 || Sizes.size() == 0 ||
+ Subscripts.size() != Sizes.size();
+ };
+
delinearize(*SE, AccessFn, Subscripts, Sizes, SE->getElementSize(&Inst));
- if (Subscripts.size() == 0 || Sizes.size() == 0 ||
- Subscripts.size() != Sizes.size()) {
+ if (UseFixedSizeArrayHeuristic && IsDelinearizationFailed()) {
+ Subscripts.clear();
+ Sizes.clear();
+ delinearizeFixedSizeArray(*SE, AccessFn, Subscripts, Sizes,
+ SE->getElementSize(&Inst));
+ }
+
+ if (IsDelinearizationFailed()) {
O << "failed to delinearize\n";
continue;
}
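
Usage note: the heuristic is gated behind the new, default-off -delinearize-use-fixed-size-array-heuristic flag and only affects this analysis printer (e.g. via opt -passes='print<delinearization>', assuming the usual printer pass name). The behavior of llvm::delinearize itself is unchanged; clients that want the fallback must call delinearizeFixedSizeArray explicitly, as sketched above.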
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index f1473b2..835e270 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -180,8 +180,8 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA,
for (inst_iterator SrcI = inst_begin(F), SrcE = inst_end(F); SrcI != SrcE;
++SrcI) {
if (SrcI->mayReadOrWriteMemory()) {
- for (inst_iterator DstI = SrcI, DstE = inst_end(F);
- DstI != DstE; ++DstI) {
+ for (inst_iterator DstI = SrcI, DstE = inst_end(F); DstI != DstE;
+ ++DstI) {
if (DstI->mayReadOrWriteMemory()) {
OS << "Src:" << *SrcI << " --> Dst:" << *DstI << "\n";
OS << " da analyze - ";
@@ -203,7 +203,7 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA,
// Normalize negative direction vectors if required by clients.
if (NormalizeResults && D->normalize(&SE))
- OS << "normalized - ";
+ OS << "normalized - ";
D->dump(OS);
for (unsigned Level = 1; Level <= D->getLevels(); Level++) {
if (D->isSplitable(Level)) {
@@ -227,8 +227,8 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA,
void DependenceAnalysisWrapperPass::print(raw_ostream &OS,
const Module *) const {
- dumpExampleDependence(OS, info.get(),
- getAnalysis<ScalarEvolutionWrapperPass>().getSE(), false);
+ dumpExampleDependence(
+ OS, info.get(), getAnalysis<ScalarEvolutionWrapperPass>().getSE(), false);
}
PreservedAnalyses
@@ -249,33 +249,26 @@ bool Dependence::isInput() const {
return Src->mayReadFromMemory() && Dst->mayReadFromMemory();
}
-
// Returns true if this is an output dependence.
bool Dependence::isOutput() const {
return Src->mayWriteToMemory() && Dst->mayWriteToMemory();
}
-
// Returns true if this is a flow (aka true) dependence.
bool Dependence::isFlow() const {
return Src->mayWriteToMemory() && Dst->mayReadFromMemory();
}
-
// Returns true if this is an anti dependence.
bool Dependence::isAnti() const {
return Src->mayReadFromMemory() && Dst->mayWriteToMemory();
}
-
// Returns true if a particular level is scalar; that is,
// if no subscript in the source or destination mentions the induction
// variable associated with the loop at this level.
// Leave this out of line, so it will serve as a virtual method anchor
-bool Dependence::isScalar(unsigned level) const {
- return false;
-}
-
+bool Dependence::isScalar(unsigned level) const { return false; }
//===----------------------------------------------------------------------===//
// FullDependence methods
@@ -338,8 +331,7 @@ bool FullDependence::normalize(ScalarEvolution *SE) {
DV[Level - 1].Direction = RevDirection;
// Reverse the dependence distance as well.
if (DV[Level - 1].Distance != nullptr)
- DV[Level - 1].Distance =
- SE->getNegativeSCEV(DV[Level - 1].Distance);
+ DV[Level - 1].Distance = SE->getNegativeSCEV(DV[Level - 1].Distance);
}
LLVM_DEBUG(dbgs() << "After normalizing negative direction vectors:\n";
@@ -355,14 +347,12 @@ unsigned FullDependence::getDirection(unsigned Level) const {
return DV[Level - 1].Direction;
}
-
// Returns the distance (or NULL) associated with a particular level.
const SCEV *FullDependence::getDistance(unsigned Level) const {
assert(0 < Level && Level <= Levels && "Level out of range");
return DV[Level - 1].Distance;
}
-
// Returns true if a particular level is scalar; that is,
// if no subscript in the source or destination mentions the induction
// variable associated with the loop at this level.
@@ -371,7 +361,6 @@ bool FullDependence::isScalar(unsigned Level) const {
return DV[Level - 1].Scalar;
}
-
// Returns true if peeling the first iteration from this loop
// will break this dependence.
bool FullDependence::isPeelFirst(unsigned Level) const {
@@ -379,7 +368,6 @@ bool FullDependence::isPeelFirst(unsigned Level) const {
return DV[Level - 1].PeelFirst;
}
-
// Returns true if peeling the last iteration from this loop
// will break this dependence.
bool FullDependence::isPeelLast(unsigned Level) const {
@@ -387,14 +375,12 @@ bool FullDependence::isPeelLast(unsigned Level) const {
return DV[Level - 1].PeelLast;
}
-
// Returns true if splitting this loop will break the dependence.
bool FullDependence::isSplitable(unsigned Level) const {
assert(0 < Level && Level <= Levels && "Level out of range");
return DV[Level - 1].Splitable;
}
-
//===----------------------------------------------------------------------===//
// DependenceInfo::Constraint methods
@@ -405,7 +391,6 @@ const SCEV *DependenceInfo::Constraint::getX() const {
return A;
}
-
// If constraint is a point <X, Y>, returns Y.
// Otherwise assert.
const SCEV *DependenceInfo::Constraint::getY() const {
@@ -413,7 +398,6 @@ const SCEV *DependenceInfo::Constraint::getY() const {
return B;
}
-
// If constraint is a line AX + BY = C, returns A.
// Otherwise assert.
const SCEV *DependenceInfo::Constraint::getA() const {
@@ -422,7 +406,6 @@ const SCEV *DependenceInfo::Constraint::getA() const {
return A;
}
-
// If constraint is a line AX + BY = C, returns B.
// Otherwise assert.
const SCEV *DependenceInfo::Constraint::getB() const {
@@ -431,7 +414,6 @@ const SCEV *DependenceInfo::Constraint::getB() const {
return B;
}
-
// If constraint is a line AX + BY = C, returns C.
// Otherwise assert.
const SCEV *DependenceInfo::Constraint::getC() const {
@@ -440,7 +422,6 @@ const SCEV *DependenceInfo::Constraint::getC() const {
return C;
}
-
// If constraint is a distance, returns D.
// Otherwise assert.
const SCEV *DependenceInfo::Constraint::getD() const {
@@ -448,7 +429,6 @@ const SCEV *DependenceInfo::Constraint::getD() const {
return SE->getNegativeSCEV(C);
}
-
// Returns the loop associated with this constraint.
const Loop *DependenceInfo::Constraint::getAssociatedLoop() const {
assert((Kind == Distance || Kind == Line || Kind == Point) &&
@@ -499,17 +479,16 @@ LLVM_DUMP_METHOD void DependenceInfo::Constraint::dump(raw_ostream &OS) const {
else if (isPoint())
OS << " Point is <" << *getX() << ", " << *getY() << ">\n";
else if (isDistance())
- OS << " Distance is " << *getD() <<
- " (" << *getA() << "*X + " << *getB() << "*Y = " << *getC() << ")\n";
+ OS << " Distance is " << *getD() << " (" << *getA() << "*X + " << *getB()
+ << "*Y = " << *getC() << ")\n";
else if (isLine())
- OS << " Line is " << *getA() << "*X + " <<
- *getB() << "*Y = " << *getC() << "\n";
+ OS << " Line is " << *getA() << "*X + " << *getB() << "*Y = " << *getC()
+ << "\n";
else
llvm_unreachable("unknown constraint type in Constraint::dump");
}
#endif
-
// Updates X with the intersection
// of the Constraints X and Y. Returns true if X has changed.
// Corresponds to Figure 4 from the paper
@@ -591,15 +570,14 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
const SCEV *A1B2 = SE->getMulExpr(X->getA(), Y->getB());
const SCEV *A2B1 = SE->getMulExpr(Y->getA(), X->getB());
const SCEVConstant *C1A2_C2A1 =
- dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1A2, C2A1));
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1A2, C2A1));
const SCEVConstant *C1B2_C2B1 =
- dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1B2, C2B1));
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1B2, C2B1));
const SCEVConstant *A1B2_A2B1 =
- dyn_cast<SCEVConstant>(SE->getMinusSCEV(A1B2, A2B1));
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(A1B2, A2B1));
const SCEVConstant *A2B1_A1B2 =
- dyn_cast<SCEVConstant>(SE->getMinusSCEV(A2B1, A1B2));
- if (!C1B2_C2B1 || !C1A2_C2A1 ||
- !A1B2_A2B1 || !A2B1_A1B2)
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(A2B1, A1B2));
+ if (!C1B2_C2B1 || !C1A2_C2A1 || !A1B2_A2B1 || !A2B1_A1B2)
return false;
APInt Xtop = C1B2_C2B1->getAPInt();
APInt Xbot = A1B2_A2B1->getAPInt();
@@ -626,8 +604,8 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
++DeltaSuccesses;
return true;
}
- if (const SCEVConstant *CUB =
- collectConstantUpperBound(X->getAssociatedLoop(), Prod1->getType())) {
+ if (const SCEVConstant *CUB = collectConstantUpperBound(
+ X->getAssociatedLoop(), Prod1->getType())) {
const APInt &UpperBound = CUB->getAPInt();
LLVM_DEBUG(dbgs() << "\t\tupper bound = " << UpperBound << "\n");
if (Xq.sgt(UpperBound) || Yq.sgt(UpperBound)) {
@@ -636,8 +614,7 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
return true;
}
}
- X->setPoint(SE->getConstant(Xq),
- SE->getConstant(Yq),
+ X->setPoint(SE->getConstant(Xq), SE->getConstant(Yq),
X->getAssociatedLoop());
++DeltaSuccesses;
return true;
@@ -667,7 +644,6 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
return false;
}
-
//===----------------------------------------------------------------------===//
// DependenceInfo methods
@@ -737,8 +713,7 @@ void Dependence::dump(raw_ostream &OS) const {
// tbaa, non-overlapping regions etc.), then it is known there is no dependency.
// Otherwise the underlying objects are checked to see if they point to
// different identifiable objects.
-static AliasResult underlyingObjectsAlias(AAResults *AA,
- const DataLayout &DL,
+static AliasResult underlyingObjectsAlias(AAResults *AA, const DataLayout &DL,
const MemoryLocation &LocA,
const MemoryLocation &LocB) {
// Check the original locations (minus size) for noalias, which can happen for
@@ -773,8 +748,7 @@ static AliasResult underlyingObjectsAlias(AAResults *AA,
// Returns true if the load or store can be analyzed. Atomic and volatile
// operations have properties which this analysis does not understand.
-static
-bool isLoadOrStore(const Instruction *I) {
+static bool isLoadOrStore(const Instruction *I) {
if (const LoadInst *LI = dyn_cast<LoadInst>(I))
return LI->isUnordered();
else if (const StoreInst *SI = dyn_cast<StoreInst>(I))
@@ -782,7 +756,6 @@ bool isLoadOrStore(const Instruction *I) {
return false;
}
-
// Examines the loop nesting of the Src and Dst
// instructions and establishes their shared loops. Sets the variables
// CommonLevels, SrcLevels, and MaxLevels.
@@ -860,14 +833,12 @@ void DependenceInfo::establishNestingLevels(const Instruction *Src,
MaxLevels -= CommonLevels;
}
-
// Given one of the loops containing the source, return
// its level index in our numbering scheme.
unsigned DependenceInfo::mapSrcLoop(const Loop *SrcLoop) const {
return SrcLoop->getLoopDepth();
}
-
// Given one of the loops containing the destination,
// return its level index in our numbering scheme.
unsigned DependenceInfo::mapDstLoop(const Loop *DstLoop) const {
@@ -880,7 +851,6 @@ unsigned DependenceInfo::mapDstLoop(const Loop *DstLoop) const {
return D;
}
-
// Returns true if Expression is loop invariant in LoopNest.
bool DependenceInfo::isLoopInvariant(const SCEV *Expression,
const Loop *LoopNest) const {
@@ -896,8 +866,6 @@ bool DependenceInfo::isLoopInvariant(const SCEV *Expression,
return SE->isLoopInvariant(Expression, LoopNest->getOutermostLoop());
}
-
-
// Finds the set of loops from the LoopNest that
// have a level <= CommonLevels and are referred to by the SCEV Expression.
void DependenceInfo::collectCommonLoops(const SCEV *Expression,
@@ -924,9 +892,9 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) {
IntegerType *SrcTy = dyn_cast<IntegerType>(Src->getType());
IntegerType *DstTy = dyn_cast<IntegerType>(Dst->getType());
if (SrcTy == nullptr || DstTy == nullptr) {
- assert(SrcTy == DstTy && "This function only unify integer types and "
- "expect Src and Dst share the same type "
- "otherwise.");
+ assert(SrcTy == DstTy &&
+             "This function only unifies integer types and "
+             "expects Src and Dst to share the same type otherwise.");
continue;
}
if (SrcTy->getBitWidth() > widestWidthSeen) {
@@ -939,7 +907,6 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) {
}
}
-
assert(widestWidthSeen > 0);
// Now extend each pair to the widest seen.
@@ -949,9 +916,9 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) {
IntegerType *SrcTy = dyn_cast<IntegerType>(Src->getType());
IntegerType *DstTy = dyn_cast<IntegerType>(Dst->getType());
if (SrcTy == nullptr || DstTy == nullptr) {
- assert(SrcTy == DstTy && "This function only unify integer types and "
- "expect Src and Dst share the same type "
- "otherwise.");
+ assert(SrcTy == DstTy &&
+             "This function only unifies integer types and "
+             "expects Src and Dst to share the same type otherwise.");
continue;
}
if (SrcTy->getBitWidth() < widestWidthSeen)
@@ -1028,7 +995,6 @@ bool DependenceInfo::checkDstSubscript(const SCEV *Dst, const Loop *LoopNest,
return checkSubscript(Dst, LoopNest, Loops, false);
}
-
// Examines the subscript pair (the Src and Dst SCEVs)
// and classifies it as either ZIV, SIV, RDIV, MIV, or Nonlinear.
// Collects the associated loops in a set.
@@ -1049,14 +1015,12 @@ DependenceInfo::classifyPair(const SCEV *Src, const Loop *SrcLoopNest,
return Subscript::ZIV;
if (N == 1)
return Subscript::SIV;
- if (N == 2 && (SrcLoops.count() == 0 ||
- DstLoops.count() == 0 ||
+ if (N == 2 && (SrcLoops.count() == 0 || DstLoops.count() == 0 ||
(SrcLoops.count() == 1 && DstLoops.count() == 1)))
return Subscript::RDIV;
return Subscript::MIV;
}
-
// A wrapper around SCEV::isKnownPredicate.
// Looks for cases where we're interested in comparing for equality.
// If both X and Y have been identically sign or zero extended,
@@ -1069,12 +1033,9 @@ DependenceInfo::classifyPair(const SCEV *Src, const Loop *SrcLoopNest,
// involving symbolics.
bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X,
const SCEV *Y) const {
- if (Pred == CmpInst::ICMP_EQ ||
- Pred == CmpInst::ICMP_NE) {
- if ((isa<SCEVSignExtendExpr>(X) &&
- isa<SCEVSignExtendExpr>(Y)) ||
- (isa<SCEVZeroExtendExpr>(X) &&
- isa<SCEVZeroExtendExpr>(Y))) {
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) {
+ if ((isa<SCEVSignExtendExpr>(X) && isa<SCEVSignExtendExpr>(Y)) ||
+ (isa<SCEVZeroExtendExpr>(X) && isa<SCEVZeroExtendExpr>(Y))) {
const SCEVIntegralCastExpr *CX = cast<SCEVIntegralCastExpr>(X);
const SCEVIntegralCastExpr *CY = cast<SCEVIntegralCastExpr>(Y);
const SCEV *Xop = CX->getOperand();
@@ -1111,7 +1072,10 @@ bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X,
}
}
-/// Compare to see if S is less than Size, using isKnownNegative(S - max(Size, 1))
+/// Compare to see if S is less than Size, using
+///
+/// isKnownNegative(S - Size)
+///
/// with some extra checking if S is an AddRec and we can prove less-than using
/// the loop bounds.
bool DependenceInfo::isKnownLessThan(const SCEV *S, const SCEV *Size) const {
@@ -1126,21 +1090,34 @@ bool DependenceInfo::isKnownLessThan(const SCEV *S, const SCEV *Size) const {
Size = SE->getTruncateOrZeroExtend(Size, MaxType);
// Special check for addrecs using BE taken count
- const SCEV *Bound = SE->getMinusSCEV(S, Size);
- if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Bound)) {
- if (AddRec->isAffine()) {
+ if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S))
+ if (AddRec->isAffine() && AddRec->hasNoSignedWrap()) {
const SCEV *BECount = SE->getBackedgeTakenCount(AddRec->getLoop());
- if (!isa<SCEVCouldNotCompute>(BECount)) {
- const SCEV *Limit = AddRec->evaluateAtIteration(BECount, *SE);
- if (SE->isKnownNegative(Limit))
- return true;
- }
+ const SCEV *Start = AddRec->getStart();
+ const SCEV *Step = AddRec->getStepRecurrence(*SE);
+ const SCEV *End = AddRec->evaluateAtIteration(BECount, *SE);
+ const SCEV *Diff0 = SE->getMinusSCEV(Start, Size);
+ const SCEV *Diff1 = SE->getMinusSCEV(End, Size);
+
+ // If the value of Step is non-negative and the AddRec is non-wrap, it
+      // reaches its maximum at the last iteration. So it's enough to check
+ // whether End - Size is negative.
+ if (SE->isKnownNonNegative(Step) && SE->isKnownNegative(Diff1))
+ return true;
+
+ // If the value of Step is non-positive and the AddRec is non-wrap, the
+ // initial value is its maximum.
+ if (SE->isKnownNonPositive(Step) && SE->isKnownNegative(Diff0))
+ return true;
+
+ // Even if we don't know the sign of Step, either Start or End must be
+ // the maximum value of the AddRec since it is non-wrap.
+ if (SE->isKnownNegative(Diff0) && SE->isKnownNegative(Diff1))
+ return true;
}
- }
// Check using normal isKnownNegative
- const SCEV *LimitedBound =
- SE->getMinusSCEV(S, SE->getSMaxExpr(Size, SE->getOne(Size->getType())));
+ const SCEV *LimitedBound = SE->getMinusSCEV(S, Size);
return SE->isKnownNegative(LimitedBound);
}
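
The reasoning behind the rewritten AddRec check, restated as a short derivation (a paraphrase using the names from the code, not text from the patch): let the subscript be the affine recurrence {Start,+,Step} with backedge-taken count BTC and End = Start + BTC * Step. Because the recurrence carries the no-signed-wrap flag, its values stay between the two endpoints:

  \min(\mathrm{Start}, \mathrm{End}) \le \{\mathrm{Start},+,\mathrm{Step}\}_i \le \max(\mathrm{Start}, \mathrm{End}), \quad 0 \le i \le \mathrm{BTC},

so S < Size holds on every iteration iff max(Start, End) - Size < 0. The three cases instantiate exactly this: a non-negative Step makes End the maximum (Diff1 < 0 suffices), a non-positive Step makes Start the maximum (Diff0 < 0 suffices), and with an unknown sign both Diff0 and Diff1 must be negative. The previous code evaluated S - Size only at the final iteration without requiring a no-wrap flag, and the final fallback now compares against Size directly rather than smax(Size, 1).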
@@ -1178,7 +1155,6 @@ const SCEV *DependenceInfo::collectUpperBound(const Loop *L, Type *T) const {
return nullptr;
}
-
// Calls collectUpperBound(), then attempts to cast it to SCEVConstant.
// If the cast fails, returns NULL.
const SCEVConstant *DependenceInfo::collectConstantUpperBound(const Loop *L,
@@ -1188,7 +1164,6 @@ const SCEVConstant *DependenceInfo::collectConstantUpperBound(const Loop *L,
return nullptr;
}
-
// testZIV -
// When we have a pair of subscripts of the form [c1] and [c2],
// where c1 and c2 are both loop invariant, we attack it using
@@ -1218,7 +1193,6 @@ bool DependenceInfo::testZIV(const SCEV *Src, const SCEV *Dst,
return false; // possibly dependent
}
-
// strongSIVtest -
// From the paper, Practical Dependence Testing, Section 4.2.1
//
@@ -1270,9 +1244,9 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound);
LLVM_DEBUG(dbgs() << ", " << *UpperBound->getType() << "\n");
const SCEV *AbsDelta =
- SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta);
+ SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta);
const SCEV *AbsCoeff =
- SE->isKnownNonNegative(Coeff) ? Coeff : SE->getNegativeSCEV(Coeff);
+ SE->isKnownNonNegative(Coeff) ? Coeff : SE->getNegativeSCEV(Coeff);
const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff);
if (isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product)) {
// Distance greater than trip count - no dependence
@@ -1286,7 +1260,7 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
if (isa<SCEVConstant>(Delta) && isa<SCEVConstant>(Coeff)) {
APInt ConstDelta = cast<SCEVConstant>(Delta)->getAPInt();
APInt ConstCoeff = cast<SCEVConstant>(Coeff)->getAPInt();
- APInt Distance = ConstDelta; // these need to be initialized
+ APInt Distance = ConstDelta; // these need to be initialized
APInt Remainder = ConstDelta;
APInt::sdivrem(ConstDelta, ConstCoeff, Distance, Remainder);
LLVM_DEBUG(dbgs() << "\t Distance = " << Distance << "\n");
@@ -1307,29 +1281,25 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
else
Result.DV[Level].Direction &= Dependence::DVEntry::EQ;
++StrongSIVsuccesses;
- }
- else if (Delta->isZero()) {
+ } else if (Delta->isZero()) {
// since 0/X == 0
Result.DV[Level].Distance = Delta;
NewConstraint.setDistance(Delta, CurLoop);
Result.DV[Level].Direction &= Dependence::DVEntry::EQ;
++StrongSIVsuccesses;
- }
- else {
+ } else {
if (Coeff->isOne()) {
LLVM_DEBUG(dbgs() << "\t Distance = " << *Delta << "\n");
Result.DV[Level].Distance = Delta; // since X/1 == X
NewConstraint.setDistance(Delta, CurLoop);
- }
- else {
+ } else {
Result.Consistent = false;
- NewConstraint.setLine(Coeff,
- SE->getNegativeSCEV(Coeff),
+ NewConstraint.setLine(Coeff, SE->getNegativeSCEV(Coeff),
SE->getNegativeSCEV(Delta), CurLoop);
}
// maybe we can get a useful direction
- bool DeltaMaybeZero = !SE->isKnownNonZero(Delta);
+ bool DeltaMaybeZero = !SE->isKnownNonZero(Delta);
bool DeltaMaybePositive = !SE->isKnownNonPositive(Delta);
bool DeltaMaybeNegative = !SE->isKnownNonNegative(Delta);
bool CoeffMaybePositive = !SE->isKnownNonPositive(Coeff);
@@ -1353,7 +1323,6 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
return false;
}
-
// weakCrossingSIVtest -
// From the paper, Practical Dependence Testing, Section 4.2.2
//
@@ -1447,8 +1416,8 @@ bool DependenceInfo::weakCrossingSIVtest(
if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound << "\n");
const SCEV *ConstantTwo = SE->getConstant(UpperBound->getType(), 2);
- const SCEV *ML = SE->getMulExpr(SE->getMulExpr(ConstCoeff, UpperBound),
- ConstantTwo);
+ const SCEV *ML =
+ SE->getMulExpr(SE->getMulExpr(ConstCoeff, UpperBound), ConstantTwo);
LLVM_DEBUG(dbgs() << "\t ML = " << *ML << "\n");
if (isKnownPredicate(CmpInst::ICMP_SGT, Delta, ML)) {
// Delta too big, no dependence
@@ -1498,7 +1467,6 @@ bool DependenceInfo::weakCrossingSIVtest(
return false;
}
-
// Kirch's algorithm, from
//
// Optimizing Supercompilers for Supercomputers
@@ -1519,9 +1487,11 @@ static bool findGCD(unsigned Bits, const APInt &AM, const APInt &BM,
APInt R = G0;
APInt::sdivrem(G0, G1, Q, R);
while (R != 0) {
+ // clang-format off
APInt A2 = A0 - Q*A1; A0 = A1; A1 = A2;
APInt B2 = B0 - Q*B1; B0 = B1; B1 = B2;
G0 = G1; G1 = R;
+ // clang-format on
APInt::sdivrem(G0, G1, Q, R);
}
G = G1;
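
The function these markers protect is Wolfe's formulation of the extended Euclidean algorithm, and the clang-format off/on pair preserves its two-column update layout. As a gloss, assuming the standard initialization A0 = 1, A1 = 0, B0 = 0, B1 = 1, G0 = |AM|, G1 = |BM| (which sits above this hunk), the loop maintains the invariant

  A_j \cdot |AM| + B_j \cdot |BM| = G_j, \quad j \in \{0, 1\},

which each step preserves: A2*|AM| + B2*|BM| = (A0 - Q*A1)*|AM| + (B0 - Q*B1)*|BM| = G0 - Q*G1 = R. On exit G = G1 = gcd(|AM|, |BM|), and the exact SIV/RDIV tests can conclude independence outright whenever G does not evenly divide Delta.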
@@ -1543,8 +1513,7 @@ static APInt floorOfQuotient(const APInt &A, const APInt &B) {
APInt::sdivrem(A, B, Q, R);
if (R == 0)
return Q;
- if ((A.sgt(0) && B.sgt(0)) ||
- (A.slt(0) && B.slt(0)))
+ if ((A.sgt(0) && B.sgt(0)) || (A.slt(0) && B.slt(0)))
return Q;
else
return Q - 1;
@@ -1556,8 +1525,7 @@ static APInt ceilingOfQuotient(const APInt &A, const APInt &B) {
APInt::sdivrem(A, B, Q, R);
if (R == 0)
return Q;
- if ((A.sgt(0) && B.sgt(0)) ||
- (A.slt(0) && B.slt(0)))
+ if ((A.sgt(0) && B.sgt(0)) || (A.slt(0) && B.slt(0)))
return Q + 1;
else
return Q;
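
A quick numeric check of the two helpers above (a standalone sketch, not part of the patch): APInt::sdivrem truncates toward zero, so the helpers adjust the quotient by one exactly when the operands have opposite signs and the division is inexact. For A = -7 and B = 2:

#include "llvm/ADT/APInt.h"
#include <cassert>
#include <cstdint>
using llvm::APInt;

int main() {
  APInt A(64, static_cast<uint64_t>(-7), /*isSigned=*/true);
  APInt B(64, 2);
  APInt Q(64, 0), R(64, 0);
  APInt::sdivrem(A, B, Q, R); // truncating division: Q = -3, R = -1
  assert(Q.getSExtValue() == -3 && R.getSExtValue() == -1);
  // A and B have opposite signs and R != 0, so:
  //   floorOfQuotient(A, B)   == Q - 1 == -4  (== floor(-3.5))
  //   ceilingOfQuotient(A, B) == Q     == -3  (== ceil(-3.5))
  return 0;
}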
@@ -1733,17 +1701,14 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
return Result.DV[Level].Direction == Dependence::DVEntry::NONE;
}
-
// Return true if the divisor evenly divides the dividend.
-static
-bool isRemainderZero(const SCEVConstant *Dividend,
- const SCEVConstant *Divisor) {
+static bool isRemainderZero(const SCEVConstant *Dividend,
+ const SCEVConstant *Divisor) {
const APInt &ConstDividend = Dividend->getAPInt();
const APInt &ConstDivisor = Divisor->getAPInt();
return ConstDividend.srem(ConstDivisor) == 0;
}
-
// weakZeroSrcSIVtest -
// From the paper, Practical Dependence Testing, Section 4.2.2
//
@@ -1807,11 +1772,11 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff,
const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(DstCoeff);
if (!ConstCoeff)
return false;
- const SCEV *AbsCoeff =
- SE->isKnownNegative(ConstCoeff) ?
- SE->getNegativeSCEV(ConstCoeff) : ConstCoeff;
+ const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff)
+ ? SE->getNegativeSCEV(ConstCoeff)
+ : ConstCoeff;
const SCEV *NewDelta =
- SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta;
+ SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta;
// check that Delta/SrcCoeff < iteration count
// really check NewDelta < count*AbsCoeff
@@ -1853,7 +1818,6 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff,
return false;
}
-
// weakZeroDstSIVtest -
// From the paper, Practical Dependence Testing, Section 4.2.2
//
@@ -1916,11 +1880,11 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff,
const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(SrcCoeff);
if (!ConstCoeff)
return false;
- const SCEV *AbsCoeff =
- SE->isKnownNegative(ConstCoeff) ?
- SE->getNegativeSCEV(ConstCoeff) : ConstCoeff;
+ const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff)
+ ? SE->getNegativeSCEV(ConstCoeff)
+ : ConstCoeff;
const SCEV *NewDelta =
- SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta;
+ SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta;
// check that Delta/SrcCoeff < iteration count
// really check NewDelta < count*AbsCoeff
@@ -1962,7 +1926,6 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff,
return false;
}
-
// exactRDIVtest - Tests the RDIV subscript pair for dependence.
// Things of the form [c1 + a*i] and [c2 + b*j],
// where i and j are induction variables, c1 and c2 are loop invariant,
@@ -2084,7 +2047,6 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
return TL.sgt(TU);
}
-
// symbolicRDIVtest -
// In Section 4.5 of the Practical Dependence Testing paper, the authors
// introduce a special case of Banerjee's Inequalities (also called the
@@ -2167,8 +2129,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
return true;
}
}
- }
- else if (SE->isKnownNonPositive(A2)) {
+ } else if (SE->isKnownNonPositive(A2)) {
// a1 >= 0 && a2 <= 0
if (N1 && N2) {
// make sure that c2 - c1 <= a1*N1 - a2*N2
@@ -2187,8 +2148,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
return true;
}
}
- }
- else if (SE->isKnownNonPositive(A1)) {
+ } else if (SE->isKnownNonPositive(A1)) {
if (SE->isKnownNonNegative(A2)) {
// a1 <= 0 && a2 >= 0
if (N1 && N2) {
@@ -2207,8 +2167,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
++SymbolicRDIVindependence;
return true;
}
- }
- else if (SE->isKnownNonPositive(A2)) {
+ } else if (SE->isKnownNonPositive(A2)) {
// a1 <= 0 && a2 <= 0
if (N1) {
// make sure that a1*N1 <= c2 - c1
@@ -2233,7 +2192,6 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
return false;
}
-
// testSIV -
// When we have a pair of subscripts of the form [c1 + a1*i] and [c2 - a2*i]
// where i is an induction variable, c1 and c2 are loop invariant, and a1 and
@@ -2260,17 +2218,17 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level,
Level = mapSrcLoop(CurLoop);
bool disproven;
if (SrcCoeff == DstCoeff)
- disproven = strongSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop,
- Level, Result, NewConstraint);
+ disproven = strongSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, Level,
+ Result, NewConstraint);
else if (SrcCoeff == SE->getNegativeSCEV(DstCoeff))
disproven = weakCrossingSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop,
Level, Result, NewConstraint, SplitIter);
else
disproven = exactSIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop,
Level, Result, NewConstraint);
- return disproven ||
- gcdMIVtest(Src, Dst, Result) ||
- symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, CurLoop);
+ return disproven || gcdMIVtest(Src, Dst, Result) ||
+ symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop,
+ CurLoop);
}
if (SrcAddRec) {
const SCEV *SrcConst = SrcAddRec->getStart();
@@ -2278,9 +2236,9 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level,
const SCEV *DstConst = Dst;
const Loop *CurLoop = SrcAddRec->getLoop();
Level = mapSrcLoop(CurLoop);
- return weakZeroDstSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop,
- Level, Result, NewConstraint) ||
- gcdMIVtest(Src, Dst, Result);
+ return weakZeroDstSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, Level,
+ Result, NewConstraint) ||
+ gcdMIVtest(Src, Dst, Result);
}
if (DstAddRec) {
const SCEV *DstConst = DstAddRec->getStart();
@@ -2288,15 +2246,14 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level,
const SCEV *SrcConst = Src;
const Loop *CurLoop = DstAddRec->getLoop();
Level = mapDstLoop(CurLoop);
- return weakZeroSrcSIVtest(DstCoeff, SrcConst, DstConst,
- CurLoop, Level, Result, NewConstraint) ||
- gcdMIVtest(Src, Dst, Result);
+ return weakZeroSrcSIVtest(DstCoeff, SrcConst, DstConst, CurLoop, Level,
+ Result, NewConstraint) ||
+ gcdMIVtest(Src, Dst, Result);
}
llvm_unreachable("SIV test expected at least one AddRec");
return false;
}
-
// testRDIV -
// When we have a pair of subscripts of the form [c1 + a1*i] and [c2 + a2*j]
// where i and j are induction variables, c1 and c2 are loop invariant,
@@ -2333,46 +2290,37 @@ bool DependenceInfo::testRDIV(const SCEV *Src, const SCEV *Dst,
DstConst = DstAddRec->getStart();
DstCoeff = DstAddRec->getStepRecurrence(*SE);
DstLoop = DstAddRec->getLoop();
- }
- else if (SrcAddRec) {
+ } else if (SrcAddRec) {
if (const SCEVAddRecExpr *tmpAddRec =
- dyn_cast<SCEVAddRecExpr>(SrcAddRec->getStart())) {
+ dyn_cast<SCEVAddRecExpr>(SrcAddRec->getStart())) {
SrcConst = tmpAddRec->getStart();
SrcCoeff = tmpAddRec->getStepRecurrence(*SE);
SrcLoop = tmpAddRec->getLoop();
DstConst = Dst;
DstCoeff = SE->getNegativeSCEV(SrcAddRec->getStepRecurrence(*SE));
DstLoop = SrcAddRec->getLoop();
- }
- else
+ } else
llvm_unreachable("RDIV reached by surprising SCEVs");
- }
- else if (DstAddRec) {
+ } else if (DstAddRec) {
if (const SCEVAddRecExpr *tmpAddRec =
- dyn_cast<SCEVAddRecExpr>(DstAddRec->getStart())) {
+ dyn_cast<SCEVAddRecExpr>(DstAddRec->getStart())) {
DstConst = tmpAddRec->getStart();
DstCoeff = tmpAddRec->getStepRecurrence(*SE);
DstLoop = tmpAddRec->getLoop();
SrcConst = Src;
SrcCoeff = SE->getNegativeSCEV(DstAddRec->getStepRecurrence(*SE));
SrcLoop = DstAddRec->getLoop();
- }
- else
+ } else
llvm_unreachable("RDIV reached by surprising SCEVs");
- }
- else
+ } else
llvm_unreachable("RDIV expected at least one AddRec");
- return exactRDIVtest(SrcCoeff, DstCoeff,
- SrcConst, DstConst,
- SrcLoop, DstLoop,
+ return exactRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, SrcLoop, DstLoop,
Result) ||
- gcdMIVtest(Src, Dst, Result) ||
- symbolicRDIVtest(SrcCoeff, DstCoeff,
- SrcConst, DstConst,
- SrcLoop, DstLoop);
+ gcdMIVtest(Src, Dst, Result) ||
+ symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, SrcLoop,
+ DstLoop);
}
-
// Tests the single-subscript MIV pair (Src and Dst) for dependence.
// Return true if dependence disproved.
// Can sometimes refine direction vectors.
@@ -2383,7 +2331,7 @@ bool DependenceInfo::testMIV(const SCEV *Src, const SCEV *Dst,
LLVM_DEBUG(dbgs() << " dst = " << *Dst << "\n");
Result.Consistent = false;
return gcdMIVtest(Src, Dst, Result) ||
- banerjeeMIVtest(Src, Dst, Loops, Result);
+ banerjeeMIVtest(Src, Dst, Loops, Result);
}
// Given a product, e.g., 10*X*Y, returns the first constant operand,
@@ -2428,7 +2376,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
// we can't quit the loop just because the GCD == 1.
const SCEV *Coefficients = Src;
while (const SCEVAddRecExpr *AddRec =
- dyn_cast<SCEVAddRecExpr>(Coefficients)) {
+ dyn_cast<SCEVAddRecExpr>(Coefficients)) {
const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
// If the coefficient is the product of a constant and other stuff,
// we can use the constant in the GCD computation.
@@ -2446,7 +2394,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
// we can't quit the loop just because the GCD == 1.
Coefficients = Dst;
while (const SCEVAddRecExpr *AddRec =
- dyn_cast<SCEVAddRecExpr>(Coefficients)) {
+ dyn_cast<SCEVAddRecExpr>(Coefficients)) {
const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
// If the coefficient is the product of a constant and other stuff,
// we can use the constant in the GCD computation.
@@ -2468,16 +2416,14 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
if (isa<SCEVConstant>(Operand)) {
assert(!Constant && "Surprised to find multiple constants");
Constant = cast<SCEVConstant>(Operand);
- }
- else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) {
+ } else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) {
// Search for constant operand to participate in GCD;
// If none found; return false.
std::optional<APInt> ConstOp = getConstantPart(Product);
if (!ConstOp)
return false;
ExtraGCD = APIntOps::GreatestCommonDivisor(ExtraGCD, ConstOp->abs());
- }
- else
+ } else
return false;
}
}
@@ -2512,7 +2458,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
bool Improved = false;
Coefficients = Src;
while (const SCEVAddRecExpr *AddRec =
- dyn_cast<SCEVAddRecExpr>(Coefficients)) {
+ dyn_cast<SCEVAddRecExpr>(Coefficients)) {
Coefficients = AddRec->getStart();
const Loop *CurLoop = AddRec->getLoop();
RunningGCD = ExtraGCD;
@@ -2578,7 +2524,6 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
return false;
}
-
//===----------------------------------------------------------------------===//
// banerjeeMIVtest -
// Use Banerjee's Inequalities to test an MIV subscript pair.
@@ -2652,8 +2597,8 @@ bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst,
if (testBounds(Dependence::DVEntry::ALL, 0, Bound, Delta)) {
// Explore the direction vector hierarchy.
unsigned DepthExpanded = 0;
- unsigned NewDeps = exploreDirections(1, A, B, Bound,
- Loops, DepthExpanded, Delta);
+ unsigned NewDeps =
+ exploreDirections(1, A, B, Bound, Loops, DepthExpanded, Delta);
if (NewDeps > 0) {
bool Improved = false;
for (unsigned K = 1; K <= CommonLevels; ++K) {
@@ -2670,23 +2615,20 @@ bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst,
}
if (Improved)
++BanerjeeSuccesses;
- }
- else {
+ } else {
++BanerjeeIndependence;
Disproved = true;
}
- }
- else {
+ } else {
++BanerjeeIndependence;
Disproved = true;
}
- delete [] Bound;
- delete [] A;
- delete [] B;
+ delete[] Bound;
+ delete[] A;
+ delete[] B;
return Disproved;
}
-
// Hierarchically expands the direction vector
// search space, combining the directions of discovered dependences
// in the DirSet field of Bound. Returns the number of distinct
@@ -2788,27 +2730,26 @@ unsigned DependenceInfo::exploreDirections(unsigned Level, CoefficientInfo *A,
// test bounds for <, *, *, ...
if (testBounds(Dependence::DVEntry::LT, Level, Bound, Delta))
- NewDeps += exploreDirections(Level + 1, A, B, Bound,
- Loops, DepthExpanded, Delta);
+ NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded,
+ Delta);
// Test bounds for =, *, *, ...
if (testBounds(Dependence::DVEntry::EQ, Level, Bound, Delta))
- NewDeps += exploreDirections(Level + 1, A, B, Bound,
- Loops, DepthExpanded, Delta);
+ NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded,
+ Delta);
// test bounds for >, *, *, ...
if (testBounds(Dependence::DVEntry::GT, Level, Bound, Delta))
- NewDeps += exploreDirections(Level + 1, A, B, Bound,
- Loops, DepthExpanded, Delta);
+ NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded,
+ Delta);
Bound[Level].Direction = Dependence::DVEntry::ALL;
return NewDeps;
- }
- else
- return exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, Delta);
+ } else
+ return exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded,
+ Delta);
}
-
// Returns true iff the current bounds are plausible.
bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level,
BoundInfo *Bound, const SCEV *Delta) const {
@@ -2822,7 +2763,6 @@ bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level,
return true;
}
-
// Computes the upper and lower bounds for level K
// using the * direction. Records them in Bound.
// Wolfe gives the equations
@@ -2840,17 +2780,16 @@ bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level,
// and the upper bound is always >= 0.
void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B,
BoundInfo *Bound, unsigned K) const {
- Bound[K].Lower[Dependence::DVEntry::ALL] = nullptr; // Default value = -infinity.
- Bound[K].Upper[Dependence::DVEntry::ALL] = nullptr; // Default value = +infinity.
+ Bound[K].Lower[Dependence::DVEntry::ALL] =
+ nullptr; // Default value = -infinity.
+ Bound[K].Upper[Dependence::DVEntry::ALL] =
+ nullptr; // Default value = +infinity.
if (Bound[K].Iterations) {
- Bound[K].Lower[Dependence::DVEntry::ALL] =
- SE->getMulExpr(SE->getMinusSCEV(A[K].NegPart, B[K].PosPart),
- Bound[K].Iterations);
- Bound[K].Upper[Dependence::DVEntry::ALL] =
- SE->getMulExpr(SE->getMinusSCEV(A[K].PosPart, B[K].NegPart),
- Bound[K].Iterations);
- }
- else {
+ Bound[K].Lower[Dependence::DVEntry::ALL] = SE->getMulExpr(
+ SE->getMinusSCEV(A[K].NegPart, B[K].PosPart), Bound[K].Iterations);
+ Bound[K].Upper[Dependence::DVEntry::ALL] = SE->getMulExpr(
+ SE->getMinusSCEV(A[K].PosPart, B[K].NegPart), Bound[K].Iterations);
+ } else {
// If the difference is 0, we won't need to know the number of iterations.
if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].NegPart, B[K].PosPart))
Bound[K].Lower[Dependence::DVEntry::ALL] =
@@ -2861,7 +2800,6 @@ void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B,
}
}
-
// Computes the upper and lower bounds for level K
// using the = direction. Records them in Bound.
// Wolfe gives the equations
@@ -2879,18 +2817,19 @@ void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B,
// and the upper bound is always >= 0.
void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B,
BoundInfo *Bound, unsigned K) const {
- Bound[K].Lower[Dependence::DVEntry::EQ] = nullptr; // Default value = -infinity.
- Bound[K].Upper[Dependence::DVEntry::EQ] = nullptr; // Default value = +infinity.
+ Bound[K].Lower[Dependence::DVEntry::EQ] =
+ nullptr; // Default value = -infinity.
+ Bound[K].Upper[Dependence::DVEntry::EQ] =
+ nullptr; // Default value = +infinity.
if (Bound[K].Iterations) {
const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff);
const SCEV *NegativePart = getNegativePart(Delta);
Bound[K].Lower[Dependence::DVEntry::EQ] =
- SE->getMulExpr(NegativePart, Bound[K].Iterations);
+ SE->getMulExpr(NegativePart, Bound[K].Iterations);
const SCEV *PositivePart = getPositivePart(Delta);
Bound[K].Upper[Dependence::DVEntry::EQ] =
- SE->getMulExpr(PositivePart, Bound[K].Iterations);
- }
- else {
+ SE->getMulExpr(PositivePart, Bound[K].Iterations);
+ } else {
// If the positive/negative part of the difference is 0,
// we won't need to know the number of iterations.
const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff);
@@ -2903,7 +2842,6 @@ void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B,
}
}
-
// Computes the upper and lower bounds for level K
// using the < direction. Records them in Bound.
// Wolfe gives the equations
@@ -2919,35 +2857,35 @@ void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B,
// We must be careful to handle the case where the upper bound is unknown.
void DependenceInfo::findBoundsLT(CoefficientInfo *A, CoefficientInfo *B,
BoundInfo *Bound, unsigned K) const {
- Bound[K].Lower[Dependence::DVEntry::LT] = nullptr; // Default value = -infinity.
- Bound[K].Upper[Dependence::DVEntry::LT] = nullptr; // Default value = +infinity.
+ Bound[K].Lower[Dependence::DVEntry::LT] =
+ nullptr; // Default value = -infinity.
+ Bound[K].Upper[Dependence::DVEntry::LT] =
+ nullptr; // Default value = +infinity.
if (Bound[K].Iterations) {
const SCEV *Iter_1 = SE->getMinusSCEV(
Bound[K].Iterations, SE->getOne(Bound[K].Iterations->getType()));
const SCEV *NegPart =
- getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff));
+ getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff));
Bound[K].Lower[Dependence::DVEntry::LT] =
- SE->getMinusSCEV(SE->getMulExpr(NegPart, Iter_1), B[K].Coeff);
+ SE->getMinusSCEV(SE->getMulExpr(NegPart, Iter_1), B[K].Coeff);
const SCEV *PosPart =
- getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff));
+ getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff));
Bound[K].Upper[Dependence::DVEntry::LT] =
- SE->getMinusSCEV(SE->getMulExpr(PosPart, Iter_1), B[K].Coeff);
- }
- else {
+ SE->getMinusSCEV(SE->getMulExpr(PosPart, Iter_1), B[K].Coeff);
+ } else {
// If the positive/negative part of the difference is 0,
// we won't need to know the number of iterations.
const SCEV *NegPart =
- getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff));
+ getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff));
if (NegPart->isZero())
Bound[K].Lower[Dependence::DVEntry::LT] = SE->getNegativeSCEV(B[K].Coeff);
const SCEV *PosPart =
- getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff));
+ getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff));
if (PosPart->isZero())
Bound[K].Upper[Dependence::DVEntry::LT] = SE->getNegativeSCEV(B[K].Coeff);
}
}
-
// Computes the upper and lower bounds for level K
// using the > direction. Records them in Bound.
// Wolfe gives the equations
@@ -2963,45 +2901,45 @@ void DependenceInfo::findBoundsLT(CoefficientInfo *A, CoefficientInfo *B,
// We must be careful to handle the case where the upper bound is unknown.
void DependenceInfo::findBoundsGT(CoefficientInfo *A, CoefficientInfo *B,
BoundInfo *Bound, unsigned K) const {
- Bound[K].Lower[Dependence::DVEntry::GT] = nullptr; // Default value = -infinity.
- Bound[K].Upper[Dependence::DVEntry::GT] = nullptr; // Default value = +infinity.
+ Bound[K].Lower[Dependence::DVEntry::GT] =
+ nullptr; // Default value = -infinity.
+ Bound[K].Upper[Dependence::DVEntry::GT] =
+ nullptr; // Default value = +infinity.
if (Bound[K].Iterations) {
const SCEV *Iter_1 = SE->getMinusSCEV(
Bound[K].Iterations, SE->getOne(Bound[K].Iterations->getType()));
const SCEV *NegPart =
- getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart));
+ getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart));
Bound[K].Lower[Dependence::DVEntry::GT] =
- SE->getAddExpr(SE->getMulExpr(NegPart, Iter_1), A[K].Coeff);
+ SE->getAddExpr(SE->getMulExpr(NegPart, Iter_1), A[K].Coeff);
const SCEV *PosPart =
- getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart));
+ getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart));
Bound[K].Upper[Dependence::DVEntry::GT] =
- SE->getAddExpr(SE->getMulExpr(PosPart, Iter_1), A[K].Coeff);
- }
- else {
+ SE->getAddExpr(SE->getMulExpr(PosPart, Iter_1), A[K].Coeff);
+ } else {
// If the positive/negative part of the difference is 0,
// we won't need to know the number of iterations.
- const SCEV *NegPart = getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart));
+ const SCEV *NegPart =
+ getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart));
if (NegPart->isZero())
Bound[K].Lower[Dependence::DVEntry::GT] = A[K].Coeff;
- const SCEV *PosPart = getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart));
+ const SCEV *PosPart =
+ getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart));
if (PosPart->isZero())
Bound[K].Upper[Dependence::DVEntry::GT] = A[K].Coeff;
}
}
-
// X^+ = max(X, 0)
const SCEV *DependenceInfo::getPositivePart(const SCEV *X) const {
return SE->getSMaxExpr(X, SE->getZero(X->getType()));
}
-
// X^- = min(X, 0)
const SCEV *DependenceInfo::getNegativePart(const SCEV *X) const {
return SE->getSMinExpr(X, SE->getZero(X->getType()));
}
-
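A quick worked example of how these two helpers feed the EQ bounds above (illustrative numbers, not from the patch): for a constant difference Delta = A[K].Coeff - B[K].Coeff = -2 and Iterations = 10,

  Delta^+ = max(-2, 0) =  0   =>  Upper[EQ] =  0 * 10 =   0
  Delta^- = min(-2, 0) = -2   =>  Lower[EQ] = -2 * 10 = -20

so under the = direction the dependence difference at level K lies in [-20, 0].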
// Walks through the subscript,
// collecting each coefficient, the associated loop bounds,
// and recording its positive and negative parts for later use.
@@ -3046,7 +2984,6 @@ DependenceInfo::collectCoeffInfo(const SCEV *Subscript, bool SrcFlag,
return CI;
}
-
// Looks through all the bounds info and
// computes the lower bound given the current direction settings
// at each level. If the lower bound for any level is -inf,
@@ -3062,7 +2999,6 @@ const SCEV *DependenceInfo::getLowerBound(BoundInfo *Bound) const {
return Sum;
}
-
// Looks through all the bounds info and
// computes the upper bound given the current direction settings
// at each level. If the upper bound at any level is +inf,
@@ -3078,7 +3014,6 @@ const SCEV *DependenceInfo::getUpperBound(BoundInfo *Bound) const {
return Sum;
}
-
//===----------------------------------------------------------------------===//
// Constraint manipulation for Delta test.
@@ -3098,7 +3033,6 @@ const SCEV *DependenceInfo::findCoefficient(const SCEV *Expr,
return findCoefficient(AddRec->getStart(), TargetLoop);
}
-
// Given a linear SCEV,
// return the SCEV given by zeroing out the coefficient
// corresponding to the specified loop.
@@ -3112,12 +3046,10 @@ const SCEV *DependenceInfo::zeroCoefficient(const SCEV *Expr,
if (AddRec->getLoop() == TargetLoop)
return AddRec->getStart();
return SE->getAddRecExpr(zeroCoefficient(AddRec->getStart(), TargetLoop),
- AddRec->getStepRecurrence(*SE),
- AddRec->getLoop(),
+ AddRec->getStepRecurrence(*SE), AddRec->getLoop(),
AddRec->getNoWrapFlags());
}
-
// Given a linear SCEV Expr,
// return the SCEV given by adding some Value to the
// coefficient corresponding to the specified TargetLoop.
@@ -3128,17 +3060,13 @@ const SCEV *DependenceInfo::addToCoefficient(const SCEV *Expr,
const SCEV *Value) const {
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr);
if (!AddRec) // create a new addRec
- return SE->getAddRecExpr(Expr,
- Value,
- TargetLoop,
+ return SE->getAddRecExpr(Expr, Value, TargetLoop,
SCEV::FlagAnyWrap); // Worst case, with no info.
if (AddRec->getLoop() == TargetLoop) {
const SCEV *Sum = SE->getAddExpr(AddRec->getStepRecurrence(*SE), Value);
if (Sum->isZero())
return AddRec->getStart();
- return SE->getAddRecExpr(AddRec->getStart(),
- Sum,
- AddRec->getLoop(),
+ return SE->getAddRecExpr(AddRec->getStart(), Sum, AddRec->getLoop(),
AddRec->getNoWrapFlags());
}
if (SE->isLoopInvariant(AddRec, TargetLoop))
@@ -3149,7 +3077,6 @@ const SCEV *DependenceInfo::addToCoefficient(const SCEV *Expr,
AddRec->getNoWrapFlags());
}
-
// Review the constraints, looking for opportunities
// to simplify a subscript pair (Src and Dst).
// Return true if some simplification occurs.
@@ -3178,7 +3105,6 @@ bool DependenceInfo::propagate(const SCEV *&Src, const SCEV *&Dst,
return Result;
}
-
// Attempt to propagate a distance
// constraint into a subscript pair (Src and Dst).
// Return true if some simplification occurs.
@@ -3204,7 +3130,6 @@ bool DependenceInfo::propagateDistance(const SCEV *&Src, const SCEV *&Dst,
return true;
}
-
// Attempt to propagate a line
// constraint into a subscript pair (Src and Dst).
// Return true if some simplification occurs.
@@ -3224,22 +3149,22 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst,
if (A->isZero()) {
const SCEVConstant *Bconst = dyn_cast<SCEVConstant>(B);
const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C);
- if (!Bconst || !Cconst) return false;
+ if (!Bconst || !Cconst)
+ return false;
APInt Beta = Bconst->getAPInt();
APInt Charlie = Cconst->getAPInt();
APInt CdivB = Charlie.sdiv(Beta);
assert(Charlie.srem(Beta) == 0 && "C should be evenly divisible by B");
const SCEV *AP_K = findCoefficient(Dst, CurLoop);
- // Src = SE->getAddExpr(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB)));
Src = SE->getMinusSCEV(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB)));
Dst = zeroCoefficient(Dst, CurLoop);
if (!findCoefficient(Src, CurLoop)->isZero())
Consistent = false;
- }
- else if (B->isZero()) {
+ } else if (B->isZero()) {
const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A);
const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C);
- if (!Aconst || !Cconst) return false;
+ if (!Aconst || !Cconst)
+ return false;
APInt Alpha = Aconst->getAPInt();
APInt Charlie = Cconst->getAPInt();
APInt CdivA = Charlie.sdiv(Alpha);
@@ -3249,11 +3174,11 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst,
Src = zeroCoefficient(Src, CurLoop);
if (!findCoefficient(Dst, CurLoop)->isZero())
Consistent = false;
- }
- else if (isKnownPredicate(CmpInst::ICMP_EQ, A, B)) {
+ } else if (isKnownPredicate(CmpInst::ICMP_EQ, A, B)) {
const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A);
const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C);
- if (!Aconst || !Cconst) return false;
+ if (!Aconst || !Cconst)
+ return false;
APInt Alpha = Aconst->getAPInt();
APInt Charlie = Cconst->getAPInt();
APInt CdivA = Charlie.sdiv(Alpha);
@@ -3264,8 +3189,7 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst,
Dst = addToCoefficient(Dst, CurLoop, A_K);
if (!findCoefficient(Dst, CurLoop)->isZero())
Consistent = false;
- }
- else {
+ } else {
// paper is incorrect here, or perhaps just misleading
const SCEV *A_K = findCoefficient(Src, CurLoop);
Src = SE->getMulExpr(Src, A);
@@ -3281,7 +3205,6 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst,
return true;
}
-
// Attempt to propagate a point
// constraint into a subscript pair (Src and Dst).
// Return true if some simplification occurs.
@@ -3302,7 +3225,6 @@ bool DependenceInfo::propagatePoint(const SCEV *&Src, const SCEV *&Dst,
return true;
}
-
// Update direction vector entry based on the current constraint.
void DependenceInfo::updateDirection(Dependence::DVEntry &Level,
const Constraint &CurConstraint) const {
@@ -3322,34 +3244,28 @@ void DependenceInfo::updateDirection(Dependence::DVEntry &Level,
if (!SE->isKnownNonNegative(Level.Distance)) // if may be negative
NewDirection |= Dependence::DVEntry::GT;
Level.Direction &= NewDirection;
- }
- else if (CurConstraint.isLine()) {
+ } else if (CurConstraint.isLine()) {
Level.Scalar = false;
Level.Distance = nullptr;
// direction should be accurate
- }
- else if (CurConstraint.isPoint()) {
+ } else if (CurConstraint.isPoint()) {
Level.Scalar = false;
Level.Distance = nullptr;
unsigned NewDirection = Dependence::DVEntry::NONE;
- if (!isKnownPredicate(CmpInst::ICMP_NE,
- CurConstraint.getY(),
+ if (!isKnownPredicate(CmpInst::ICMP_NE, CurConstraint.getY(),
CurConstraint.getX()))
// if X may be = Y
NewDirection |= Dependence::DVEntry::EQ;
- if (!isKnownPredicate(CmpInst::ICMP_SLE,
- CurConstraint.getY(),
+ if (!isKnownPredicate(CmpInst::ICMP_SLE, CurConstraint.getY(),
CurConstraint.getX()))
// if Y may be > X
NewDirection |= Dependence::DVEntry::LT;
- if (!isKnownPredicate(CmpInst::ICMP_SGE,
- CurConstraint.getY(),
+ if (!isKnownPredicate(CmpInst::ICMP_SGE, CurConstraint.getY(),
CurConstraint.getX()))
// if Y may be < X
NewDirection |= Dependence::DVEntry::GT;
Level.Direction &= NewDirection;
- }
- else
+ } else
llvm_unreachable("constraint has unexpected kind");
}
@@ -3425,7 +3341,7 @@ bool DependenceInfo::tryDelinearizeFixedSize(
dyn_cast<SCEVUnknown>(SE->getPointerBase(DstAccessFn));
assert(SrcBase && DstBase && SrcBase == DstBase &&
"expected src and dst scev unknowns to be equal");
- });
+ });
SmallVector<int, 4> SrcSizes;
SmallVector<int, 4> DstSizes;
@@ -3737,9 +3653,8 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
Pair[P].Group.resize(Pairs);
removeMatchingExtensions(&Pair[P]);
Pair[P].Classification =
- classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()),
- Pair[P].Dst, LI->getLoopFor(Dst->getParent()),
- Pair[P].Loops);
+ classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), Pair[P].Dst,
+ LI->getLoopFor(Dst->getParent()), Pair[P].Loops);
Pair[P].GroupLoops = Pair[P].Loops;
Pair[P].Group.set(P);
LLVM_DEBUG(dbgs() << " subscript " << P << "\n");
@@ -3814,18 +3729,15 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
if (Pair[SI].Classification == Subscript::NonLinear) {
// ignore these, but collect loops for later
++NonlinearSubscriptPairs;
- collectCommonLoops(Pair[SI].Src,
- LI->getLoopFor(Src->getParent()),
+ collectCommonLoops(Pair[SI].Src, LI->getLoopFor(Src->getParent()),
Pair[SI].Loops);
- collectCommonLoops(Pair[SI].Dst,
- LI->getLoopFor(Dst->getParent()),
+ collectCommonLoops(Pair[SI].Dst, LI->getLoopFor(Dst->getParent()),
Pair[SI].Loops);
Result.Consistent = false;
} else if (Pair[SI].Classification == Subscript::ZIV) {
// always separable
Separable.set(SI);
- }
- else {
+ } else {
// SIV, RDIV, or MIV, so check for coupled group
bool Done = true;
for (unsigned SJ = SI + 1; SJ < Pairs; ++SJ) {
@@ -3843,8 +3755,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
if (Pair[SI].Group.count() == 1) {
Separable.set(SI);
++SeparableSubscriptPairs;
- }
- else {
+ } else {
Coupled.set(SI);
++CoupledSubscriptPairs;
}
@@ -3950,10 +3861,9 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
Constraints, Result.Consistent)) {
LLVM_DEBUG(dbgs() << "\t Changed\n");
++DeltaPropagations;
- Pair[SJ].Classification =
- classifyPair(Pair[SJ].Src, LI->getLoopFor(Src->getParent()),
- Pair[SJ].Dst, LI->getLoopFor(Dst->getParent()),
- Pair[SJ].Loops);
+ Pair[SJ].Classification = classifyPair(
+ Pair[SJ].Src, LI->getLoopFor(Src->getParent()), Pair[SJ].Dst,
+ LI->getLoopFor(Dst->getParent()), Pair[SJ].Loops);
switch (Pair[SJ].Classification) {
case Subscript::ZIV:
LLVM_DEBUG(dbgs() << "ZIV\n");
@@ -3995,8 +3905,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
LLVM_DEBUG(dbgs() << "MIV test\n");
if (testMIV(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, Result))
return nullptr;
- }
- else
+ } else
llvm_unreachable("expected only MIV subscripts at this point");
}
@@ -4052,8 +3961,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
break;
}
}
- }
- else {
+ } else {
// On the other hand, if all directions are equal and there's no
// loop-independent dependence possible, then no dependence exists.
bool AllEqual = true;
@@ -4158,9 +4066,8 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
Pair[P].Group.resize(Pairs);
removeMatchingExtensions(&Pair[P]);
Pair[P].Classification =
- classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()),
- Pair[P].Dst, LI->getLoopFor(Dst->getParent()),
- Pair[P].Loops);
+ classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), Pair[P].Dst,
+ LI->getLoopFor(Dst->getParent()), Pair[P].Loops);
Pair[P].GroupLoops = Pair[P].Loops;
Pair[P].Group.set(P);
}
@@ -4172,15 +4079,12 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
for (unsigned SI = 0; SI < Pairs; ++SI) {
if (Pair[SI].Classification == Subscript::NonLinear) {
// ignore these, but collect loops for later
- collectCommonLoops(Pair[SI].Src,
- LI->getLoopFor(Src->getParent()),
+ collectCommonLoops(Pair[SI].Src, LI->getLoopFor(Src->getParent()),
Pair[SI].Loops);
- collectCommonLoops(Pair[SI].Dst,
- LI->getLoopFor(Dst->getParent()),
+ collectCommonLoops(Pair[SI].Dst, LI->getLoopFor(Dst->getParent()),
Pair[SI].Loops);
Result.Consistent = false;
- }
- else if (Pair[SI].Classification == Subscript::ZIV)
+ } else if (Pair[SI].Classification == Subscript::ZIV)
Separable.set(SI);
else {
// SIV, RDIV, or MIV, so check for coupled group
@@ -4214,8 +4118,8 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
case Subscript::SIV: {
unsigned Level;
const SCEV *SplitIter = nullptr;
- (void) testSIV(Pair[SI].Src, Pair[SI].Dst, Level,
- Result, NewConstraint, SplitIter);
+ (void)testSIV(Pair[SI].Src, Pair[SI].Dst, Level, Result, NewConstraint,
+ SplitIter);
if (Level == SplitLevel) {
assert(SplitIter != nullptr);
return SplitIter;
diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index 2b0f212..67c2cfa 100644
--- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -150,6 +150,10 @@ static ModRefInfo GetLocation(const Instruction *Inst, MemoryLocation &Loc,
switch (II->getIntrinsicID()) {
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
+ Loc = MemoryLocation::getForArgument(II, 0, TLI);
+ // These intrinsics don't really modify the memory, but returning Mod
+ // will allow them to be handled conservatively.
+ return ModRefInfo::Mod;
case Intrinsic::invariant_start:
Loc = MemoryLocation::getForArgument(II, 1, TLI);
// These intrinsics don't really modify the memory, but returning Mod
@@ -441,11 +445,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(
Intrinsic::ID ID = II->getIntrinsicID();
switch (ID) {
case Intrinsic::lifetime_start: {
- // FIXME: This only considers queries directly on the invariant-tagged
- // pointer, not on query pointers that are indexed off of them. It'd
- // be nice to handle that at some point (the right approach is to use
- // GetPointerBaseWithConstantOffset).
- MemoryLocation ArgLoc = MemoryLocation::getAfter(II->getArgOperand(1));
+ MemoryLocation ArgLoc = MemoryLocation::getAfter(II->getArgOperand(0));
if (BatchAA.isMustAlias(ArgLoc, MemLoc))
return MemDepResult::getDef(II);
continue;
diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index 28a2640..72b643c 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -191,7 +191,7 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end: {
- assert(ArgIdx == 1 && "Invalid argument index");
+ assert(ArgIdx == 0 && "Invalid argument index");
auto *AI = dyn_cast<AllocaInst>(Arg);
if (!AI)
// lifetime of poison value.
diff --git a/llvm/lib/Analysis/StackLifetime.cpp b/llvm/lib/Analysis/StackLifetime.cpp
index abe4985..1e20fca 100644
--- a/llvm/lib/Analysis/StackLifetime.cpp
+++ b/llvm/lib/Analysis/StackLifetime.cpp
@@ -70,7 +70,7 @@ void StackLifetime::collectMarkers() {
const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
if (!II || !II->isLifetimeStartOrEnd())
continue;
- const AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(1));
+ const AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(0));
if (!AI)
continue;
auto It = AllocaNumbering.find(AI);
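The MemoryDependenceAnalysis, MemoryLocation, and StackLifetime hunks above, together with the IRTranslator and AtomicExpandPass hunks further below, all reflect the same underlying change: the lifetime intrinsics apparently no longer carry a leading size operand, so the alloca is now operand 0. A minimal builder-level sketch under that assumption (names here are illustrative, not from the patch):

  // Minimal sketch, assuming the new single-operand lifetime form.
  AllocaInst *Buf =
      Builder.CreateAlloca(Builder.getInt8Ty(), Builder.getInt64(16), "buf");
  Builder.CreateLifetimeStart(Buf); // previously: CreateLifetimeStart(Buf, Size)
  // ... code that uses Buf ...
  Builder.CreateLifetimeEnd(Buf);   // previously: CreateLifetimeEnd(Buf, Size)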
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 1e70228..b0e4b00 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -9147,7 +9147,8 @@ static bool matchTwoInputRecurrence(const PHINode *PN, InstTy *&Inst,
return false;
for (unsigned I = 0; I != 2; ++I) {
- if (auto *Operation = dyn_cast<InstTy>(PN->getIncomingValue(I))) {
+ if (auto *Operation = dyn_cast<InstTy>(PN->getIncomingValue(I));
+ Operation && Operation->getNumOperands() >= 2) {
Value *LHS = Operation->getOperand(0);
Value *RHS = Operation->getOperand(1);
if (LHS != PN && RHS != PN)
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index b3b4c37..425ea31 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -81,6 +81,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::exp:
case Intrinsic::exp10:
case Intrinsic::exp2:
+ case Intrinsic::ldexp:
case Intrinsic::log:
case Intrinsic::log10:
case Intrinsic::log2:
@@ -108,6 +109,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::canonicalize:
case Intrinsic::fptosi_sat:
case Intrinsic::fptoui_sat:
+ case Intrinsic::lround:
+ case Intrinsic::llround:
case Intrinsic::lrint:
case Intrinsic::llrint:
case Intrinsic::ucmp:
@@ -189,6 +192,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
switch (ID) {
case Intrinsic::fptosi_sat:
case Intrinsic::fptoui_sat:
+ case Intrinsic::lround:
+ case Intrinsic::llround:
case Intrinsic::lrint:
case Intrinsic::llrint:
case Intrinsic::vp_lrint:
@@ -203,6 +208,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
case Intrinsic::vp_is_fpclass:
return OpdIdx == 0;
case Intrinsic::powi:
+ case Intrinsic::ldexp:
return OpdIdx == -1 || OpdIdx == 1;
default:
return OpdIdx == -1;
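For isVectorIntrinsicWithOverloadTypeAtArg, returning true for OpdIdx == 1 means the second operand contributes its own overloaded type to the intrinsic's mangled name, as powi already does. Under that reading, a vectorized ldexp call would presumably be declared along these lines (assumed mangling, shown only for illustration):

  declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>)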
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 520c6a0..3d5bd61 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -928,6 +928,7 @@ lltok::Kind LLLexer::LexIdentifier() {
INSTKEYWORD(fptoui, FPToUI);
INSTKEYWORD(fptosi, FPToSI);
INSTKEYWORD(inttoptr, IntToPtr);
+ INSTKEYWORD(ptrtoaddr, PtrToAddr);
INSTKEYWORD(ptrtoint, PtrToInt);
INSTKEYWORD(bitcast, BitCast);
INSTKEYWORD(addrspacecast, AddrSpaceCast);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 13bef1f..1bc2906 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -4273,6 +4273,7 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) {
case lltok::kw_bitcast:
case lltok::kw_addrspacecast:
case lltok::kw_inttoptr:
+ case lltok::kw_ptrtoaddr:
case lltok::kw_ptrtoint: {
unsigned Opc = Lex.getUIntVal();
Type *DestTy = nullptr;
@@ -7310,6 +7311,7 @@ int LLParser::parseInstruction(Instruction *&Inst, BasicBlock *BB,
case lltok::kw_fptoui:
case lltok::kw_fptosi:
case lltok::kw_inttoptr:
+ case lltok::kw_ptrtoaddr:
case lltok::kw_ptrtoint:
return parseCast(Inst, PFS, KeywordVal);
case lltok::kw_fptrunc:
diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp
index 36d10d0..eb83945 100644
--- a/llvm/lib/BinaryFormat/DXContainer.cpp
+++ b/llvm/lib/BinaryFormat/DXContainer.cpp
@@ -60,6 +60,17 @@ ArrayRef<EnumEntry<SigComponentType>> dxbc::getSigComponentTypes() {
return ArrayRef(SigComponentTypes);
}
+static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
+ {"SRV", llvm::dxil::ResourceClass::SRV},
+ {"UAV", llvm::dxil::ResourceClass::UAV},
+ {"CBV", llvm::dxil::ResourceClass::CBuffer},
+ {"Sampler", llvm::dxil::ResourceClass::Sampler},
+};
+
+ArrayRef<EnumEntry<llvm::dxil::ResourceClass>> dxbc::getResourceClasses() {
+ return ArrayRef(ResourceClassNames);
+}
+
static const EnumEntry<RootFlags> RootFlagNames[] = {
#define ROOT_SIGNATURE_FLAG(Val, Enum) {#Enum, RootFlags::Enum},
#include "llvm/BinaryFormat/DXContainerConstants.def"
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 290d873..22a0d0f 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -1283,6 +1283,7 @@ static int getDecodedCastOpcode(unsigned Val) {
case bitc::CAST_SITOFP : return Instruction::SIToFP;
case bitc::CAST_FPTRUNC : return Instruction::FPTrunc;
case bitc::CAST_FPEXT : return Instruction::FPExt;
+ case bitc::CAST_PTRTOADDR: return Instruction::PtrToAddr;
case bitc::CAST_PTRTOINT: return Instruction::PtrToInt;
case bitc::CAST_INTTOPTR: return Instruction::IntToPtr;
case bitc::CAST_BITCAST : return Instruction::BitCast;
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 05680fa..a3f8254 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -647,6 +647,7 @@ static unsigned getEncodedCastOpcode(unsigned Opcode) {
case Instruction::SIToFP : return bitc::CAST_SITOFP;
case Instruction::FPTrunc : return bitc::CAST_FPTRUNC;
case Instruction::FPExt : return bitc::CAST_FPEXT;
+ case Instruction::PtrToAddr: return bitc::CAST_PTRTOADDR;
case Instruction::PtrToInt: return bitc::CAST_PTRTOINT;
case Instruction::IntToPtr: return bitc::CAST_INTTOPTR;
case Instruction::BitCast : return bitc::CAST_BITCAST;
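Taken together, the LLLexer, LLParser, BitcodeReader, and BitcodeWriter hunks thread a new ptrtoaddr cast through the textual and bitcode formats alongside ptrtoint; the SelectionDAGBuilder hunk further below notes it is currently lowered like ptrtoint and is not yet correct when the address width differs from the pointer width. By analogy with ptrtoint, the assumed textual form would be:

  define i64 @addr_of(ptr %p) {
    ; assumed syntax, mirroring ptrtoint
    %a = ptrtoaddr ptr %p to i64
    ret i64 %a
  }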
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index c72b6e8..23a3543 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -3657,6 +3657,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV,
break; // Error
}
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt: {
const DataLayout &DL = getDataLayout();
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 3f3d5dc9..278dd65 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -1915,7 +1915,6 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
// TODO: the "order" argument type is "int", not int32. So
// getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
- ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
Constant *OrderingVal =
ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
@@ -2012,7 +2011,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
if (CASExpected) {
AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
AllocaCASExpected->setAlignment(AllocaAlignment);
- Builder.CreateLifetimeStart(AllocaCASExpected, SizeVal64);
+ Builder.CreateLifetimeStart(AllocaCASExpected);
Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
Args.push_back(AllocaCASExpected);
}
@@ -2026,7 +2025,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
} else {
AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
AllocaValue->setAlignment(AllocaAlignment);
- Builder.CreateLifetimeStart(AllocaValue, SizeVal64);
+ Builder.CreateLifetimeStart(AllocaValue);
Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
Args.push_back(AllocaValue);
}
@@ -2036,7 +2035,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
if (!CASExpected && HasResult && !UseSizedLibcall) {
AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
AllocaResult->setAlignment(AllocaAlignment);
- Builder.CreateLifetimeStart(AllocaResult, SizeVal64);
+ Builder.CreateLifetimeStart(AllocaResult);
Args.push_back(AllocaResult);
}
@@ -2069,7 +2068,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
// And then, extract the results...
if (ValueOperand && !UseSizedLibcall)
- Builder.CreateLifetimeEnd(AllocaValue, SizeVal64);
+ Builder.CreateLifetimeEnd(AllocaValue);
if (CASExpected) {
// The final result from the CAS is {load of 'expected' alloca, bool result
@@ -2078,7 +2077,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
Value *V = PoisonValue::get(FinalResultTy);
Value *ExpectedOut = Builder.CreateAlignedLoad(
CASExpected->getType(), AllocaCASExpected, AllocaAlignment);
- Builder.CreateLifetimeEnd(AllocaCASExpected, SizeVal64);
+ Builder.CreateLifetimeEnd(AllocaCASExpected);
V = Builder.CreateInsertValue(V, ExpectedOut, 0);
V = Builder.CreateInsertValue(V, Result, 1);
I->replaceAllUsesWith(V);
@@ -2089,7 +2088,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
else {
V = Builder.CreateAlignedLoad(I->getType(), AllocaResult,
AllocaAlignment);
- Builder.CreateLifetimeEnd(AllocaResult, SizeVal64);
+ Builder.CreateLifetimeEnd(AllocaResult);
}
I->replaceAllUsesWith(V);
}
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 9ba1782..0f3ec8b 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -132,9 +132,10 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
unsigned i = 0;
unsigned NumFixedArgs = CB.getFunctionType()->getNumParams();
for (const auto &Arg : CB.args()) {
- ArgInfo OrigArg{ArgRegs[i], *Arg.get(), i, getAttributesForArgIdx(CB, i),
- i < NumFixedArgs};
+ ArgInfo OrigArg{ArgRegs[i], *Arg.get(), i, getAttributesForArgIdx(CB, i)};
setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CB);
+ if (i >= NumFixedArgs)
+ OrigArg.Flags[0].setVarArg();
// If we have an explicit sret argument that is an Instruction, (i.e., it
// might point to function-local memory), we can't meaningfully tail-call.
@@ -301,7 +302,7 @@ void CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
// double] -> double).
SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
OrigArg.OrigArgIndex, OrigArg.Flags[0],
- OrigArg.IsFixed, OrigArg.OrigValue);
+ OrigArg.OrigValue);
return;
}
@@ -313,7 +314,7 @@ void CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) {
Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx);
SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.OrigArgIndex,
- OrigArg.Flags[0], OrigArg.IsFixed);
+ OrigArg.Flags[0]);
if (NeedsRegBlock)
SplitArgs.back().Flags[0].setInConsecutiveRegs();
}
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index bbfae57..d30dfa7 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2209,7 +2209,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
unsigned Op = ID == Intrinsic::lifetime_start ? TargetOpcode::LIFETIME_START
: TargetOpcode::LIFETIME_END;
- const AllocaInst *AI = dyn_cast<AllocaInst>(CI.getArgOperand(1));
+ const AllocaInst *AI = dyn_cast<AllocaInst>(CI.getArgOperand(0));
if (!AI || !AI->isStaticAlloca())
return true;
diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp
index ca51b67..5f37890 100644
--- a/llvm/lib/CodeGen/RegisterPressure.cpp
+++ b/llvm/lib/CodeGen/RegisterPressure.cpp
@@ -1001,7 +1001,7 @@ static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec,
++CritIdx;
if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == i) {
- int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].getUnitInc();
+ int PDiff = (int)PNew - CriticalPSets[CritIdx].getUnitInc();
if (PDiff > 0) {
Delta.CriticalMax = PressureChange(i);
Delta.CriticalMax.setUnitInc(PDiff);
@@ -1191,7 +1191,7 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff,
++CritIdx;
if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) {
- int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc();
+ int CritInc = (int)MNew - CriticalPSets[CritIdx].getUnitInc();
if (CritInc > 0 && CritInc <= std::numeric_limits<int16_t>::max()) {
Delta.CriticalMax = PressureChange(PSetID);
Delta.CriticalMax.setUnitInc(CritInc);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7341914..17703f5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12843,22 +12843,21 @@ SDValue DAGCombiner::visitMHISTOGRAM(SDNode *N) {
SDLoc DL(HG);
EVT MemVT = HG->getMemoryVT();
+ EVT DataVT = Index.getValueType();
MachineMemOperand *MMO = HG->getMemOperand();
ISD::MemIndexType IndexType = HG->getIndexType();
if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
return Chain;
- SDValue Ops[] = {Chain, Inc, Mask, BasePtr, Index,
- HG->getScale(), HG->getIntID()};
- if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL))
+ if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL) ||
+ refineIndexType(Index, IndexType, DataVT, DAG)) {
+ SDValue Ops[] = {Chain, Inc, Mask, BasePtr, Index,
+ HG->getScale(), HG->getIntID()};
return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops,
MMO, IndexType);
+ }
- EVT DataVT = Index.getValueType();
- if (refineIndexType(Index, IndexType, DataVT, DAG))
- return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops,
- MMO, IndexType);
return SDValue();
}
@@ -16343,6 +16342,38 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
DAG, DL);
}
break;
+ case ISD::ABDU:
+ case ISD::ABDS:
+ // (trunc (abdu/abds a, b)) → (abdu/abds (trunc a), (trunc b))
+ if (!LegalOperations || N0.hasOneUse()) {
+ EVT SrcVT = N0.getValueType();
+ EVT TruncVT = VT;
+ unsigned SrcBits = SrcVT.getScalarSizeInBits();
+ unsigned TruncBits = TruncVT.getScalarSizeInBits();
+ unsigned NeededBits = SrcBits - TruncBits;
+
+ SDValue A = N0.getOperand(0);
+ SDValue B = N0.getOperand(1);
+ bool CanFold = false;
+
+ if (N0.getOpcode() == ISD::ABDU) {
+ KnownBits KnownA = DAG.computeKnownBits(A);
+ KnownBits KnownB = DAG.computeKnownBits(B);
+ CanFold = KnownA.countMinLeadingZeros() >= NeededBits &&
+ KnownB.countMinLeadingZeros() >= NeededBits;
+ } else {
+ unsigned SignBitsA = DAG.ComputeNumSignBits(A);
+ unsigned SignBitsB = DAG.ComputeNumSignBits(B);
+ CanFold = SignBitsA > NeededBits && SignBitsB > NeededBits;
+ }
+
+ if (CanFold && TLI.isOperationLegal(N0.getOpcode(), VT)) {
+ SDValue NewA = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, A);
+ SDValue NewB = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, B);
+ return DAG.getNode(N0.getOpcode(), DL, TruncVT, NewA, NewB);
+ }
+ }
+ break;
}
return SDValue();
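The guard in the new ABDU/ABDS case matters: the fold is only sound when both inputs already fit in the narrow type, so the absolute difference computed before and after truncation agrees. Illustrative i32 -> i8 values (NeededBits = 24, numbers not from the patch):

  // Both operands have >= 24 leading zeros: safe.
  //   trunc(abdu(200, 50)) = trunc(150) = 150
  //   abdu(trunc(200), trunc(50)) = abdu(200, 50) = 150
  // 256 has only 23 leading zeros: the guard correctly rejects.
  //   trunc(abdu(256, 255)) = trunc(1) = 1
  //   abdu(trunc(256), trunc(255)) = abdu(0, 255) = 255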
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 649a310..5ef1746 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1371,7 +1371,7 @@ void SelectionDAG::init(MachineFunction &NewMF,
const TargetLibraryInfo *LibraryInfo,
UniformityInfo *NewUA, ProfileSummaryInfo *PSIin,
BlockFrequencyInfo *BFIin, MachineModuleInfo &MMIin,
- FunctionVarLocs const *VarLocs, bool HasDivergency) {
+ FunctionVarLocs const *VarLocs) {
MF = &NewMF;
SDAGISelPass = PassPtr;
ORE = &NewORE;
@@ -1384,7 +1384,6 @@ void SelectionDAG::init(MachineFunction &NewMF,
BFI = BFIin;
MMI = &MMIin;
FnVarLocs = VarLocs;
- DivergentTarget = HasDivergency;
}
SelectionDAG::~SelectionDAG() {
@@ -2331,8 +2330,7 @@ SDValue SelectionDAG::getRegister(Register Reg, EVT VT) {
return SDValue(E, 0);
auto *N = newSDNode<RegisterSDNode>(Reg, VTs);
- N->SDNodeBits.IsDivergent =
- DivergentTarget && TLI->isSDNodeSourceOfDivergence(N, FLI, UA);
+ N->SDNodeBits.IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, UA);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -5630,6 +5628,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::FDIV:
case ISD::FREM:
case ISD::FCOPYSIGN:
+ case ISD::FP_EXTEND:
// No poison except from flags (which is handled above)
return false;
@@ -8888,6 +8887,44 @@ static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI,
}
}
+std::pair<SDValue, SDValue>
+SelectionDAG::getMemcmp(SDValue Chain, const SDLoc &dl, SDValue Mem0,
+ SDValue Mem1, SDValue Size, const CallInst *CI) {
+ const char *LibCallName = TLI->getLibcallName(RTLIB::MEMCMP);
+ if (!LibCallName)
+ return {};
+
+ // Emit a library call.
+ auto GetEntry = [](Type *Ty, SDValue &SDV) {
+ TargetLowering::ArgListEntry E;
+ E.Ty = Ty;
+ E.Node = SDV;
+ return E;
+ };
+
+ PointerType *PT = PointerType::getUnqual(*getContext());
+ TargetLowering::ArgListTy Args = {
+ GetEntry(PT, Mem0), GetEntry(PT, Mem1),
+ GetEntry(getDataLayout().getIntPtrType(*getContext()), Size)};
+
+ TargetLowering::CallLoweringInfo CLI(*this);
+ bool IsTailCall = false;
+ bool ReturnsFirstArg = CI && funcReturnsFirstArgOfCall(*CI);
+ IsTailCall = CI && CI->isTailCall() &&
+ isInTailCallPosition(*CI, getTarget(), ReturnsFirstArg);
+
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(
+ TLI->getLibcallCallingConv(RTLIB::MEMCMP),
+ Type::getInt32Ty(*getContext()),
+ getExternalSymbol(LibCallName, TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
+ .setTailCall(IsTailCall);
+
+ return TLI->LowerCallTo(CLI);
+}
+
SDValue SelectionDAG::getMemcpy(
SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size,
Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI,
@@ -12225,8 +12262,6 @@ static bool gluePropagatesDivergence(const SDNode *Node) {
}
bool SelectionDAG::calculateDivergence(SDNode *N) {
- if (!DivergentTarget)
- return false;
if (TLI->isSDNodeAlwaysUniform(N)) {
assert(!TLI->isSDNodeSourceOfDivergence(N, FLI, UA) &&
"Conflicting divergence information!");
@@ -12246,8 +12281,6 @@ bool SelectionDAG::calculateDivergence(SDNode *N) {
}
void SelectionDAG::updateDivergence(SDNode *N) {
- if (!DivergentTarget)
- return;
SmallVector<SDNode *, 16> Worklist(1, N);
do {
N = Worklist.pop_back_val();
@@ -13808,20 +13841,16 @@ void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
Ops[I].setInitial(Vals[I]);
EVT VT = Ops[I].getValueType();
- // Take care of the Node's operands iff target has divergence
// Skip Chain. It does not carry divergence.
- if (DivergentTarget && VT != MVT::Other &&
+ if (VT != MVT::Other &&
(VT != MVT::Glue || gluePropagatesDivergence(Ops[I].getNode())) &&
Ops[I].getNode()->isDivergent()) {
- // Node is going to be divergent if at least one of its operand is
- // divergent, unless it belongs to the "AlwaysUniform" exemptions.
IsDivergent = true;
}
}
Node->NumOperands = Vals.size();
Node->OperandList = Ops;
- // Check the divergence of the Node itself.
- if (DivergentTarget && !TLI->isSDNodeAlwaysUniform(Node)) {
+ if (!TLI->isSDNodeAlwaysUniform(Node)) {
IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, UA);
Node->SDNodeBits.IsDivergent = IsDivergent;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index d0815e9..48ab797 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -2273,9 +2273,8 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
Flags.setNoExt();
for (unsigned i = 0; i < NumParts; ++i) {
- Outs.push_back(ISD::OutputArg(Flags,
- Parts[i].getValueType().getSimpleVT(),
- VT, /*isfixed=*/true, 0, 0));
+ Outs.push_back(ISD::OutputArg(
+ Flags, Parts[i].getValueType().getSimpleVT(), VT, 0, 0));
OutVals.push_back(Parts[i]);
}
}
@@ -2291,9 +2290,9 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
assert(SwiftError.getFunctionArg() && "Need a swift error argument");
ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
Flags.setSwiftError();
- Outs.push_back(ISD::OutputArg(
- Flags, /*vt=*/TLI.getPointerTy(DL), /*argvt=*/EVT(TLI.getPointerTy(DL)),
- /*isfixed=*/true, /*origidx=*/1, /*partOffs=*/0));
+ Outs.push_back(ISD::OutputArg(Flags, /*vt=*/TLI.getPointerTy(DL),
+ /*argvt=*/EVT(TLI.getPointerTy(DL)),
+ /*origidx=*/1, /*partOffs=*/0));
// Create SDNode for the swifterror virtual register.
OutVals.push_back(
DAG.getRegister(SwiftError.getOrCreateVRegUseAt(
@@ -3978,6 +3977,11 @@ void SelectionDAGBuilder::visitSIToFP(const User &I) {
setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N));
}
+void SelectionDAGBuilder::visitPtrToAddr(const User &I) {
+ // FIXME: this is not correct for pointers with addr width != pointer width
+ visitPtrToInt(I);
+}
+
void SelectionDAGBuilder::visitPtrToInt(const User &I) {
// What to do depends on the size of the integer and the size of the pointer.
// We can either truncate, zero extend, or no-op, accordingly.
@@ -7598,7 +7602,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
if (TM.getOptLevel() == CodeGenOptLevel::None)
return;
- const AllocaInst *LifetimeObject = dyn_cast<AllocaInst>(I.getArgOperand(1));
+ const AllocaInst *LifetimeObject = dyn_cast<AllocaInst>(I.getArgOperand(0));
if (!LifetimeObject)
return;
@@ -9091,7 +9095,7 @@ bool SelectionDAGBuilder::visitMemCmpBCmpCall(const CallInst &I) {
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res = TSI.EmitTargetCodeForMemcmp(
DAG, getCurSDLoc(), DAG.getRoot(), getValue(LHS), getValue(RHS),
- getValue(Size), MachinePointerInfo(LHS), MachinePointerInfo(RHS));
+ getValue(Size), &I);
if (Res.first.getNode()) {
processIntegerCallValue(I, Res.first, true);
PendingLoads.push_back(Res.second);
@@ -11124,6 +11128,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
const Align OriginalAlignment(getABIAlignmentForCallingConv(ArgTy, DL));
Flags.setOrigAlign(OriginalAlignment);
+ if (i >= CLI.NumFixedArgs)
+ Flags.setVarArg();
if (Args[i].Ty->isPointerTy()) {
Flags.setPointer();
Flags.setPointerAddrSpace(
@@ -11246,8 +11252,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// For scalable vectors the scalable part is currently handled
// by individual targets, so we just use the known minimum size here.
ISD::OutputArg MyFlags(
- Flags, Parts[j].getValueType().getSimpleVT(), VT,
- i < CLI.NumFixedArgs, i,
+ Flags, Parts[j].getValueType().getSimpleVT(), VT, i,
j * Parts[j].getValueType().getStoreSize().getKnownMinValue());
if (NumParts > 1 && j == 0)
MyFlags.Flags.setSplit();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index c251755..e0835e6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -574,6 +574,7 @@ private:
void visitFPToSI(const User &I);
void visitUIToFP(const User &I);
void visitSIToFP(const User &I);
+ void visitPtrToAddr(const User &I);
void visitPtrToInt(const User &I);
void visitIntToPtr(const User &I);
void visitBitCast(const User &I);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 26071ed..ece50ed 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -480,10 +480,7 @@ void SelectionDAGISel::initializeAnalysisResults(
MachineModuleInfo &MMI =
MAMP.getCachedResult<MachineModuleAnalysis>(*Fn.getParent())->getMMI();
- TTI = &FAM.getResult<TargetIRAnalysis>(Fn);
-
- CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, MMI, FnVarLocs,
- TTI->hasBranchDivergence(&Fn));
+ CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, MMI, FnVarLocs);
// Now get the optional analyzes if we want to.
// This is based on the possibly changed OptLevel (after optnone is taken
@@ -501,6 +498,10 @@ void SelectionDAGISel::initializeAnalysisResults(
BatchAA = std::nullopt;
SP = &FAM.getResult<SSPLayoutAnalysis>(Fn);
+
+#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
+ TTI = &FAM.getResult<TargetIRAnalysis>(Fn);
+#endif
}
void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
@@ -536,10 +537,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
MachineModuleInfo &MMI =
MFP.getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
- TTI = &MFP.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn);
-
- CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, MMI, FnVarLocs,
- TTI->hasBranchDivergence(&Fn));
+ CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, MMI, FnVarLocs);
// Now get the optional analyzes if we want to.
// This is based on the possibly changed OptLevel (after optnone is taken
@@ -558,6 +556,10 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
BatchAA = std::nullopt;
SP = &MFP.getAnalysis<StackProtector>().getLayoutInfo();
+
+#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
+ TTI = &MFP.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn);
+#endif
}
bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 9f525ea..d80a229 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1008,7 +1008,7 @@ unsigned TargetLoweringBase::getBitWidthForCttzElements(
CR = CR.subtract(APInt(64, 1));
unsigned EltWidth = RetTy->getScalarSizeInBits();
- EltWidth = std::min(EltWidth, (unsigned)CR.getActiveBits());
+ EltWidth = std::min(EltWidth, CR.getActiveBits());
EltWidth = std::max(llvm::bit_ceil(EltWidth), (unsigned)8);
return EltWidth;
@@ -1772,7 +1772,7 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType,
Flags.setZExt();
for (unsigned i = 0; i < NumParts; ++i)
- Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, /*isfixed=*/true, 0, 0));
+ Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, 0, 0));
}
}
@@ -1893,6 +1893,7 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const {
case SIToFP: return ISD::SINT_TO_FP;
case FPTrunc: return ISD::FP_ROUND;
case FPExt: return ISD::FP_EXTEND;
+ case PtrToAddr: return ISD::BITCAST;
case PtrToInt: return ISD::BITCAST;
case IntToPtr: return ISD::BITCAST;
case BitCast: return ISD::BITCAST;
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp b/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
index 987e639..a201fae 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
@@ -60,6 +60,20 @@ void DWARFGdbIndex::dumpSymbolTable(raw_ostream &OS) const {
", filled slots:",
SymbolTableOffset, (uint64_t)SymbolTable.size())
<< '\n';
+
+ const auto FindCuVectorId = [&](uint32_t VecOffset) {
+    // Entries in ConstantPoolVectors are sorted by their offset in the
+    // constant pool; see how ConstantPoolVectors is populated in parseImpl.
+ const auto *It =
+ llvm::lower_bound(ConstantPoolVectors, VecOffset,
+ [](const auto &ConstantPoolEntry, uint32_t Offset) {
+ return ConstantPoolEntry.first < Offset;
+ });
+ assert(It != ConstantPoolVectors.end() && It->first == VecOffset &&
+ "Invalid symbol table");
+ return It - ConstantPoolVectors.begin();
+ };
+
uint32_t I = -1;
for (const SymTableEntry &E : SymbolTable) {
++I;
@@ -72,13 +86,7 @@ void DWARFGdbIndex::dumpSymbolTable(raw_ostream &OS) const {
StringRef Name = ConstantPoolStrings.substr(
ConstantPoolOffset - StringPoolOffset + E.NameOffset);
- auto CuVector = llvm::find_if(
- ConstantPoolVectors,
- [&](const std::pair<uint32_t, SmallVector<uint32_t, 0>> &V) {
- return V.first == E.VecOffset;
- });
- assert(CuVector != ConstantPoolVectors.end() && "Invalid symbol table");
- uint32_t CuVectorId = CuVector - ConstantPoolVectors.begin();
+ const uint32_t CuVectorId = FindCuVectorId(E.VecOffset);
OS << format(" String name: %s, CU vector index: %d\n", Name.data(),
CuVectorId);
}
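The lambda above replaces a per-symbol linear find_if with a binary search, which is valid only because ConstantPoolVectors is kept sorted by constant-pool offset (hence the strengthened assert). A standalone sketch of the same lookup shape, over hypothetical data:

  #include <algorithm>
  #include <cassert>
  #include <cstdint>
  #include <utility>
  #include <vector>

  int main() {
    // Pairs of (constant-pool offset, CU vector), sorted by offset.
    std::vector<std::pair<uint32_t, std::vector<uint32_t>>> Pool = {
        {0, {1}}, {12, {2, 3}}, {40, {7}}};
    uint32_t VecOffset = 12;
    auto It = std::lower_bound(
        Pool.begin(), Pool.end(), VecOffset,
        [](const auto &Entry, uint32_t Off) { return Entry.first < Off; });
    assert(It != Pool.end() && It->first == VecOffset);
    return static_cast<int>(It - Pool.begin()); // CU vector index: 1
  }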
diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
index 79904fc..574883e 100644
--- a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
@@ -92,16 +92,9 @@ static raw_ostream &operator<<(raw_ostream &OS,
return OS;
}
-static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
- {"CBV", dxil::ResourceClass::CBuffer},
- {"SRV", dxil::ResourceClass::SRV},
- {"UAV", dxil::ResourceClass::UAV},
- {"Sampler", dxil::ResourceClass::Sampler},
-};
-
static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) {
OS << enumToStringRef(dxil::ResourceClass(llvm::to_underlying(Type)),
- ArrayRef(ResourceClassNames));
+ dxbc::getResourceClasses());
return OS;
}
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
index 9cf4ed1..1cda308 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
@@ -51,13 +51,6 @@ static std::optional<StringRef> extractMdStringValue(MDNode *Node,
return NodeText->getString();
}
-static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
- {"CBV", dxil::ResourceClass::CBuffer},
- {"SRV", dxil::ResourceClass::SRV},
- {"UAV", dxil::ResourceClass::UAV},
- {"Sampler", dxil::ResourceClass::Sampler},
-};
-
namespace {
// We use the OverloadVisit with std::visit to ensure the compiler catches if a
@@ -128,7 +121,7 @@ MDNode *MetadataBuilder::BuildRootDescriptor(const RootDescriptor &Descriptor) {
IRBuilder<> Builder(Ctx);
StringRef ResName =
enumToStringRef(dxil::ResourceClass(to_underlying(Descriptor.Type)),
- ArrayRef(ResourceClassNames));
+ dxbc::getResourceClasses());
assert(!ResName.empty() && "Provided an invalid Resource Class");
SmallString<7> Name({"Root", ResName});
Metadata *Operands[] = {
@@ -170,7 +163,7 @@ MDNode *MetadataBuilder::BuildDescriptorTableClause(
IRBuilder<> Builder(Ctx);
StringRef ResName =
enumToStringRef(dxil::ResourceClass(to_underlying(Clause.Type)),
- ArrayRef(ResourceClassNames));
+ dxbc::getResourceClasses());
assert(!ResName.empty() && "Provided an invalid Resource Class");
Metadata *Operands[] = {
MDString::get(Ctx, ResName),
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
index 9d84aa8..72308a3d 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
@@ -29,7 +29,7 @@ bool verifyRegisterValue(uint32_t RegisterValue) {
  // This range is reserved, and therefore invalid, according to the spec
// https://github.com/llvm/wg-hlsl/blob/main/proposals/0002-root-signature-in-clang.md#all-the-values-should-be-legal
bool verifyRegisterSpace(uint32_t RegisterSpace) {
- return !(RegisterSpace >= 0xFFFFFFF0 && RegisterSpace <= 0xFFFFFFFF);
+ return !(RegisterSpace >= 0xFFFFFFF0);
}
bool verifyRootDescriptorFlag(uint32_t Version, uint32_t FlagsVal) {
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 3aa4f7a..ea027e4 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4014,6 +4014,340 @@ OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
/*Conditional*/ true, /*hasFinalize*/ true);
}
+static llvm::CallInst *emitNoUnwindRuntimeCall(IRBuilder<> &Builder,
+ llvm::FunctionCallee Callee,
+ ArrayRef<llvm::Value *> Args,
+ const llvm::Twine &Name) {
+ llvm::CallInst *Call = Builder.CreateCall(
+ Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
+ Call->setDoesNotThrow();
+ return Call;
+}
+
+// Expects the input basic block to be dominated by BeforeScanBB. Once the
+// scan directive is encountered, the code after it must be dominated by
+// AfterScanBB. The scan directive splits the code sequence into an input
+// phase and a scan phase. Depending on whether the inclusive or exclusive
+// clause is used and whether the input loop or the scan loop is being
+// lowered, this adds jumps to the input and scan phases. The first scan
+// loop is the input loop and the second is the scan loop. The generated
+// code currently handles only inclusive scans.
+OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
+ const LocationDescription &Loc, InsertPointTy AllocaIP,
+ ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
+ bool IsInclusive, ScanInfo *ScanRedInfo) {
+ if (ScanRedInfo->OMPFirstScanLoop) {
+ llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
+ ScanVarsType, ScanRedInfo);
+ if (Err)
+ return Err;
+ }
+ if (!updateToLocation(Loc))
+ return Loc.IP;
+
+ llvm::Value *IV = ScanRedInfo->IV;
+
+ if (ScanRedInfo->OMPFirstScanLoop) {
+ // Emit buffer[i] = red; at the end of the input phase.
+ for (size_t i = 0; i < ScanVars.size(); i++) {
+ Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
+ Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
+ Type *DestTy = ScanVarsType[i];
+ Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
+ Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
+
+ Builder.CreateStore(Src, Val);
+ }
+ }
+ Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
+ emitBlock(ScanRedInfo->OMPScanDispatch,
+ Builder.GetInsertBlock()->getParent());
+
+ if (!ScanRedInfo->OMPFirstScanLoop) {
+ IV = ScanRedInfo->IV;
+ // Emit red = buffer[i]; at the entrance to the scan phase.
+ // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
+ for (size_t i = 0; i < ScanVars.size(); i++) {
+ Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
+ Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
+ Type *DestTy = ScanVarsType[i];
+ Value *SrcPtr =
+ Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
+ Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
+ Builder.CreateStore(Src, ScanVars[i]);
+ }
+ }
+
+ // TODO: Update it to CreateBr and remove dead blocks
+ llvm::Value *CmpI = Builder.getInt1(true);
+ if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
+ Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
+ ScanRedInfo->OMPAfterScanBlock);
+ } else {
+ Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
+ ScanRedInfo->OMPBeforeScanBlock);
+ }
+ emitBlock(ScanRedInfo->OMPAfterScanBlock,
+ Builder.GetInsertBlock()->getParent());
+ Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
+ return Builder.saveIP();
+}
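For orientation, the construct this lowering serves is an OpenMP inscan reduction. A minimal C++ illustration of the inclusive form the comment says is currently handled (illustrative code, not from the patch), matching the buffer[i] = red and red = buffer[i] phases in createScan:

  #pragma omp parallel for reduction(inscan, +: sum)
  for (int i = 0; i < n; ++i) {
    sum += a[i];                 // input phase: buffer[i] = red
    #pragma omp scan inclusive(sum)
    b[i] = sum;                  // scan phase: red = buffer[i]
  }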
+
+Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
+ InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
+ ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
+
+ Builder.restoreIP(AllocaIP);
+ // Create the shared pointer at alloca IP.
+ for (size_t i = 0; i < ScanVars.size(); i++) {
+ llvm::Value *BuffPtr =
+ Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
+ (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
+ }
+
+  // Allocate the temporary buffer on the master thread.
+ auto BodyGenCB = [&](InsertPointTy AllocaIP,
+ InsertPointTy CodeGenIP) -> Error {
+ Builder.restoreIP(CodeGenIP);
+ Value *AllocSpan =
+ Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
+ for (size_t i = 0; i < ScanVars.size(); i++) {
+ Type *IntPtrTy = Builder.getInt32Ty();
+ Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
+ Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
+ Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
+ AllocSpan, nullptr, "arr");
+ Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
+ }
+ return Error::success();
+ };
+ // TODO: Perform finalization actions for variables. This has to be
+ // called for variables which have destructors/finalizers.
+ auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
+
+ Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
+ llvm::Value *FilterVal = Builder.getInt32(0);
+ llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
+ createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
+
+ if (!AfterIP)
+ return AfterIP.takeError();
+ Builder.restoreIP(*AfterIP);
+ BasicBlock *InputBB = Builder.GetInsertBlock();
+ if (InputBB->getTerminator())
+ Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
+ AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
+ if (!AfterIP)
+ return AfterIP.takeError();
+ Builder.restoreIP(*AfterIP);
+
+ return Error::success();
+}
+
+Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
+ ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
+ auto BodyGenCB = [&](InsertPointTy AllocaIP,
+ InsertPointTy CodeGenIP) -> Error {
+ Builder.restoreIP(CodeGenIP);
+ for (ReductionInfo RedInfo : ReductionInfos) {
+ Value *PrivateVar = RedInfo.PrivateVariable;
+ Value *OrigVar = RedInfo.Variable;
+ Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
+ Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
+
+ Type *SrcTy = RedInfo.ElementType;
+ Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
+ "arrayOffset");
+ Value *Src = Builder.CreateLoad(SrcTy, Val);
+
+ Builder.CreateStore(Src, OrigVar);
+ Builder.CreateFree(Buff);
+ }
+ return Error::success();
+ };
+ // TODO: Perform finalization actions for variables. This has to be
+ // called for variables which have destructors/finalizers.
+ auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
+
+ if (ScanRedInfo->OMPScanFinish->getTerminator())
+ Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
+ else
+ Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
+
+ llvm::Value *FilterVal = Builder.getInt32(0);
+ llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
+ createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
+
+ if (!AfterIP)
+ return AfterIP.takeError();
+ Builder.restoreIP(*AfterIP);
+ BasicBlock *InputBB = Builder.GetInsertBlock();
+ if (InputBB->getTerminator())
+ Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
+ AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
+ if (!AfterIP)
+ return AfterIP.takeError();
+ Builder.restoreIP(*AfterIP);
+ return Error::success();
+}
+
+OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
+ const LocationDescription &Loc,
+ ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+ ScanInfo *ScanRedInfo) {
+
+ if (!updateToLocation(Loc))
+ return Loc.IP;
+ auto BodyGenCB = [&](InsertPointTy AllocaIP,
+ InsertPointTy CodeGenIP) -> Error {
+ Builder.restoreIP(CodeGenIP);
+ Function *CurFn = Builder.GetInsertBlock()->getParent();
+ // for (int k = 0; k <= ceil(log2(n)); ++k)
+ llvm::BasicBlock *LoopBB =
+ BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
+ llvm::BasicBlock *ExitBB =
+ splitBB(Builder, false, "omp.outer.log.scan.exit");
+ llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
+ Builder.GetInsertBlock()->getModule(),
+ (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
+ llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
+ llvm::Value *Arg =
+ Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
+ llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
+ F = llvm::Intrinsic::getOrInsertDeclaration(
+ Builder.GetInsertBlock()->getModule(),
+ (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
+ LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
+ LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
+ llvm::Value *NMin1 = Builder.CreateNUWSub(
+ ScanRedInfo->Span,
+ llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
+ Builder.SetInsertPoint(InputBB);
+ Builder.CreateBr(LoopBB);
+ emitBlock(LoopBB, CurFn);
+ Builder.SetInsertPoint(LoopBB);
+
+ PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
+ // size pow2k = 1;
+ PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
+ Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
+ InputBB);
+ Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
+ InputBB);
+ // for (size i = n - 1; i >= 2 ^ k; --i)
+ // tmp[i] op= tmp[i-pow2k];
+ llvm::BasicBlock *InnerLoopBB =
+ BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
+ llvm::BasicBlock *InnerExitBB =
+ BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
+ llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
+ Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
+ emitBlock(InnerLoopBB, CurFn);
+ Builder.SetInsertPoint(InnerLoopBB);
+ PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
+ IVal->addIncoming(NMin1, LoopBB);
+ for (ReductionInfo RedInfo : ReductionInfos) {
+ Value *ReductionVal = RedInfo.PrivateVariable;
+ Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
+ Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
+ Type *DestTy = RedInfo.ElementType;
+ Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
+ Value *LHSPtr =
+ Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
+ Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
+ Value *RHSPtr =
+ Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
+ Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
+ Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
+ llvm::Value *Result;
+ InsertPointOrErrorTy AfterIP =
+ RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
+ if (!AfterIP)
+ return AfterIP.takeError();
+ Builder.CreateStore(Result, LHSPtr);
+ }
+ llvm::Value *NextIVal = Builder.CreateNUWSub(
+ IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
+ IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
+ CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
+ Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
+ emitBlock(InnerExitBB, CurFn);
+ llvm::Value *Next = Builder.CreateNUWAdd(
+ Counter, llvm::ConstantInt::get(Counter->getType(), 1));
+ Counter->addIncoming(Next, Builder.GetInsertBlock());
+ // pow2k <<= 1;
+ llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
+ Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
+ llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
+ Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
+ Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
+ return Error::success();
+ };
+
+ // TODO: Perform finalization actions for variables. This has to be
+ // called for variables which have destructors/finalizers.
+ auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
+
+ llvm::Value *FilterVal = Builder.getInt32(0);
+ llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
+ createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
+
+ if (!AfterIP)
+ return AfterIP.takeError();
+ Builder.restoreIP(*AfterIP);
+ AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
+
+ if (!AfterIP)
+ return AfterIP.takeError();
+ Builder.restoreIP(*AfterIP);
+ Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
+ if (Err)
+ return Err;
+
+ return AfterIP;
+}
+
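The body generator above encodes a Hillis-Steele style inclusive scan over each privatized buffer. A serial sketch of the same computation (illustrative only; Reduce stands in for ReductionGen, Buff for the buffer behind ScanBuffPtrs, and CeilLog2 for the log2/ceil intrinsic pair):

    for (unsigned K = 0; K < CeilLog2(Span); ++K) {
      unsigned Pow2K = 1u << K;                    // pow2k <<= 1 each round
      for (unsigned I = Span - 1; I >= Pow2K; --I) // the inner IVal loop
        Buff[I + 1] = Reduce(Buff[I + 1], Buff[I + 1 - Pow2K]);
    }
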
+Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
+ llvm::function_ref<Error()> InputLoopGen,
+ llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
+ ScanInfo *ScanRedInfo) {
+
+ {
+ // Emit loop with input phase:
+ // for (i: 0..<num_iters>) {
+ // <input phase>;
+ // buffer[i] = red;
+ // }
+ ScanRedInfo->OMPFirstScanLoop = true;
+ Error Err = InputLoopGen();
+ if (Err)
+ return Err;
+ }
+ {
+ // Emit loop with scan phase:
+ // for (i: 0..<num_iters>) {
+ // red = buffer[i];
+ // <scan phase>;
+ // }
+ ScanRedInfo->OMPFirstScanLoop = false;
+ Error Err = ScanLoopGen(Builder.saveIP());
+ if (Err)
+ return Err;
+ }
+ return Error::success();
+}
+
+void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
+ Function *Fun = Builder.GetInsertBlock()->getParent();
+ ScanRedInfo->OMPScanDispatch =
+ BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
+ ScanRedInfo->OMPAfterScanBlock =
+ BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
+ ScanRedInfo->OMPBeforeScanBlock =
+ BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
+ ScanRedInfo->OMPScanLoopExit =
+ BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
+}
CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
BasicBlock *PostInsertBefore, const Twine &Name) {
@@ -4111,6 +4445,76 @@ OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
return CL;
}
+Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
+ ScanInfos.emplace_front();
+ ScanInfo *Result = &ScanInfos.front();
+ return Result;
+}
+
+Expected<SmallVector<llvm::CanonicalLoopInfo *>>
+OpenMPIRBuilder::createCanonicalScanLoops(
+ const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
+ Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
+ InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
+ LocationDescription ComputeLoc =
+ ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
+ updateToLocation(ComputeLoc);
+
+ SmallVector<CanonicalLoopInfo *> Result;
+
+ Value *TripCount = calculateCanonicalLoopTripCount(
+ ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
+ ScanRedInfo->Span = TripCount;
+ ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
+ Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
+
+ auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
+ Builder.restoreIP(CodeGenIP);
+ ScanRedInfo->IV = IV;
+ createScanBBs(ScanRedInfo);
+ BasicBlock *InputBlock = Builder.GetInsertBlock();
+ Instruction *Terminator = InputBlock->getTerminator();
+ assert(Terminator->getNumSuccessors() == 1);
+ BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
+ Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
+ emitBlock(ScanRedInfo->OMPBeforeScanBlock,
+ Builder.GetInsertBlock()->getParent());
+ Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
+ emitBlock(ScanRedInfo->OMPScanLoopExit,
+ Builder.GetInsertBlock()->getParent());
+ Builder.CreateBr(ContinueBlock);
+ Builder.SetInsertPoint(
+ ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
+ return BodyGenCB(Builder.saveIP(), IV);
+ };
+
+ const auto &&InputLoopGen = [&]() -> Error {
+ Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
+ Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
+ ComputeIP, Name, true, ScanRedInfo);
+ if (!LoopInfo)
+ return LoopInfo.takeError();
+ Result.push_back(*LoopInfo);
+ Builder.restoreIP((*LoopInfo)->getAfterIP());
+ return Error::success();
+ };
+ const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
+ Expected<CanonicalLoopInfo *> LoopInfo =
+ createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
+ InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
+ if (!LoopInfo)
+ return LoopInfo.takeError();
+ Result.push_back(*LoopInfo);
+ Builder.restoreIP((*LoopInfo)->getAfterIP());
+ ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
+ return Error::success();
+ };
+ Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
+ if (Err)
+ return Err;
+ return Result;
+}
+
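Per loop iteration, the BodyGen wrapper above reroutes control through the blocks made by createScanBBs; the rewired body looks roughly like this (a sketch; how OMPScanDispatch later selects between the before/after scan blocks is wired up elsewhere and not shown):

    body.entry         -> omp.inscan.dispatch
    omp.before.scan.bb -> omp.scan.loop.exit   // BodyGenCB code lands here
    omp.scan.loop.exit -> <original continue block>
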
Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
bool IsSigned, bool InclusiveStop, const Twine &Name) {
@@ -4174,7 +4578,8 @@ Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
- InsertPointTy ComputeIP, const Twine &Name) {
+ InsertPointTy ComputeIP, const Twine &Name, bool InScan,
+ ScanInfo *ScanRedInfo) {
LocationDescription ComputeLoc =
ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
@@ -4185,6 +4590,8 @@ Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
Builder.restoreIP(CodeGenIP);
Value *Span = Builder.CreateMul(IV, Step);
Value *IndVar = Builder.CreateAdd(Span, Start);
+ if (InScan)
+ ScanRedInfo->IV = IndVar;
return BodyGenCB(Builder.saveIP(), IndVar);
};
LocationDescription LoopLoc =
@@ -8956,7 +9363,8 @@ OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
- AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
+ AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
+ bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
if (!updateToLocation(Loc))
return Loc.IP;
@@ -8974,9 +9382,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
"OpenMP atomic does not support LT or GT operations");
});
- Expected<std::pair<Value *, Value *>> AtomicResult =
- emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
- X.IsVolatile, IsXBinopExpr);
+ Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
+ AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
+ IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
if (!AtomicResult)
return AtomicResult.takeError();
checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
@@ -9023,7 +9431,8 @@ Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
- AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
+ AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
+ bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
// TODO: handle the case where XElemTy is not byte-sized or not a power of 2
// or a complex datatype.
bool emitRMWOp = false;
@@ -9046,7 +9455,20 @@ Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
std::pair<Value *, Value *> Res;
if (emitRMWOp) {
- Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
+ AtomicRMWInst *RMWInst =
+ Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
+ if (T.isAMDGPU()) {
+ if (IsIgnoreDenormalMode)
+ RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
+ llvm::MDNode::get(Builder.getContext(), {}));
+ if (!IsFineGrainedMemory)
+ RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
+ llvm::MDNode::get(Builder.getContext(), {}));
+ if (!IsRemoteMemory)
+ RMWInst->setMetadata("amdgpu.no.remote.memory",
+ llvm::MDNode::get(Builder.getContext(), {}));
+ }
+ Res.first = RMWInst;
// not needed except in case of postfix captures. Generate anyway for
// consistency with the else part. Will be removed with any DCE pass.
  // AtomicRMWInst::Xchg does not have a corresponding instruction.
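On the caller side, the three new flags thread straight through createAtomicUpdate; a hedged sketch of requesting an atomic update whose atomicrmw may carry the AMDGPU metadata above (all variable names hypothetical):

    OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createAtomicUpdate(
        Loc, AllocaIP, X, Expr, AtomicOrdering::Monotonic, AtomicRMWInst::FAdd,
        UpdateOp, /*IsXBinopExpr=*/false, /*IsIgnoreDenormalMode=*/true,
        /*IsFineGrainedMemory=*/false, /*IsRemoteMemory=*/false);
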
@@ -9178,7 +9600,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
- bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
+ bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
+ bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
if (!updateToLocation(Loc))
return Loc.IP;
@@ -9197,9 +9620,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
// If UpdateExpr is 'x' updated with some `expr` not based on 'x',
// 'x' is simply atomically rewritten with 'expr'.
AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
- Expected<std::pair<Value *, Value *>> AtomicResult =
- emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
- X.IsVolatile, IsXBinopExpr);
+ Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
+ AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
+ IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
if (!AtomicResult)
return AtomicResult.takeError();
Value *CapturedVal =
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 7159107..b91fd70 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1311,14 +1311,15 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
}
break;
case 'l':
- if (Name.starts_with("lifetime.start") ||
- Name.starts_with("lifetime.end")) {
- // Unless remangling is required, do not upgrade the function declaration,
- // but do upgrade the calls.
- if (auto Result = llvm::Intrinsic::remangleIntrinsicFunction(F))
- NewFn = *Result;
- else
- NewFn = F;
+ if ((Name.starts_with("lifetime.start") ||
+ Name.starts_with("lifetime.end")) &&
+ F->arg_size() == 2) {
+ Intrinsic::ID IID = Name.starts_with("lifetime.start")
+ ? Intrinsic::lifetime_start
+ : Intrinsic::lifetime_end;
+ rename(F);
+ NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID,
+ F->getArg(0)->getType());
return true;
}
break;
@@ -5133,21 +5134,20 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end: {
- Value *Size = CI->getArgOperand(0);
- Value *Ptr = CI->getArgOperand(1);
- if (isa<AllocaInst>(Ptr)) {
+ if (CI->arg_size() != 2) {
DefaultCase();
return;
}
+ Value *Ptr = CI->getArgOperand(1);
// Try to strip pointer casts, such that the lifetime works on an alloca.
Ptr = Ptr->stripPointerCasts();
if (isa<AllocaInst>(Ptr)) {
// Don't use NewFn, as we might have looked through an addrspacecast.
if (NewFn->getIntrinsicID() == Intrinsic::lifetime_start)
- NewCall = Builder.CreateLifetimeStart(Ptr, cast<ConstantInt>(Size));
+ NewCall = Builder.CreateLifetimeStart(Ptr);
else
- NewCall = Builder.CreateLifetimeEnd(Ptr, cast<ConstantInt>(Size));
+ NewCall = Builder.CreateLifetimeEnd(Ptr);
break;
}
diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index d4ad21e..6b202ba 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -254,6 +254,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
return FoldBitCast(V, DestTy);
case Instruction::AddrSpaceCast:
case Instruction::IntToPtr:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
return nullptr;
}
diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp
index e09c139..2fcdbcc6 100644
--- a/llvm/lib/IR/ConstantRange.cpp
+++ b/llvm/lib/IR/ConstantRange.cpp
@@ -829,6 +829,7 @@ ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp,
case Instruction::FPTrunc:
case Instruction::FPExt:
case Instruction::IntToPtr:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::AddrSpaceCast:
// Conservatively return getFull set.
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index a3c725b..c7e3113a 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -1567,6 +1567,7 @@ Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty,
case Instruction::SIToFP:
case Instruction::FPToUI:
case Instruction::FPToSI:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::BitCast:
@@ -2223,6 +2224,8 @@ Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty,
llvm_unreachable("Invalid cast opcode");
case Instruction::Trunc:
return getTrunc(C, Ty, OnlyIfReduced);
+ case Instruction::PtrToAddr:
+ return getPtrToAddr(C, Ty, OnlyIfReduced);
case Instruction::PtrToInt:
return getPtrToInt(C, Ty, OnlyIfReduced);
case Instruction::IntToPtr:
@@ -2280,6 +2283,20 @@ Constant *ConstantExpr::getTrunc(Constant *C, Type *Ty, bool OnlyIfReduced) {
return getFoldedCast(Instruction::Trunc, C, Ty, OnlyIfReduced);
}
+Constant *ConstantExpr::getPtrToAddr(Constant *C, Type *DstTy,
+ bool OnlyIfReduced) {
+ assert(C->getType()->isPtrOrPtrVectorTy() &&
+ "PtrToAddr source must be pointer or pointer vector");
+ assert(DstTy->isIntOrIntVectorTy() &&
+ "PtrToAddr destination must be integer or integer vector");
+ assert(isa<VectorType>(C->getType()) == isa<VectorType>(DstTy));
+ if (isa<VectorType>(C->getType()))
+ assert(cast<VectorType>(C->getType())->getElementCount() ==
+ cast<VectorType>(DstTy)->getElementCount() &&
+ "Invalid cast between a different number of vector elements");
+ return getFoldedCast(Instruction::PtrToAddr, C, DstTy, OnlyIfReduced);
+}
+
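A usage sketch for the new constant-expression helper (M is a hypothetical Module; i64 assumes a 64-bit address width for the global's address space):

    GlobalVariable *GV = M.getNamedGlobal("g");
    Constant *Addr =
        ConstantExpr::getPtrToAddr(GV, Type::getInt64Ty(M.getContext()));
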
Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy,
bool OnlyIfReduced) {
assert(C->getType()->isPtrOrPtrVectorTy() &&
@@ -2435,6 +2452,7 @@ bool ConstantExpr::isDesirableCastOp(unsigned Opcode) {
case Instruction::FPToSI:
return false;
case Instruction::Trunc:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::BitCast:
@@ -2457,6 +2475,7 @@ bool ConstantExpr::isSupportedCastOp(unsigned Opcode) {
case Instruction::FPToSI:
return false;
case Instruction::Trunc:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::BitCast:
@@ -3401,6 +3420,7 @@ Instruction *ConstantExpr::getAsInstruction() const {
switch (getOpcode()) {
case Instruction::Trunc:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::BitCast:
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index f1d4549..96065ed 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -57,15 +57,9 @@ DebugVariable::DebugVariable(const DbgVariableRecord *DVR)
DILocation::DILocation(LLVMContext &C, StorageType Storage, unsigned Line,
unsigned Column, uint64_t AtomGroup, uint8_t AtomRank,
ArrayRef<Metadata *> MDs, bool ImplicitCode)
- : MDNode(C, DILocationKind, Storage, MDs)
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
- ,
- AtomGroup(AtomGroup), AtomRank(AtomRank)
-#endif
-{
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
+ : MDNode(C, DILocationKind, Storage, MDs), AtomGroup(AtomGroup),
+ AtomRank(AtomRank) {
assert(AtomRank <= 7 && "AtomRank number should fit in 3 bits");
-#endif
if (AtomGroup)
C.updateDILocationAtomGroupWaterline(AtomGroup + 1);
diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp
index 7b799c7..11d33e2 100644
--- a/llvm/lib/IR/Globals.cpp
+++ b/llvm/lib/IR/Globals.cpp
@@ -404,6 +404,7 @@ findBaseObject(const Constant *C, DenseSet<const GlobalAlias *> &Aliases,
return findBaseObject(CE->getOperand(0), Aliases, Op);
}
case Instruction::IntToPtr:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::BitCast:
case Instruction::GetElementPtr:
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 49c6dc7..614c3a9 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -411,28 +411,16 @@ CallInst *IRBuilderBase::CreateFPMinimumReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_fminimum, Src);
}
-CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
+CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr) {
assert(isa<PointerType>(Ptr->getType()) &&
"lifetime.start only applies to pointers.");
- if (!Size)
- Size = getInt64(-1);
- else
- assert(Size->getType() == getInt64Ty() &&
- "lifetime.start requires the size to be an i64");
- Value *Ops[] = { Size, Ptr };
- return CreateIntrinsic(Intrinsic::lifetime_start, {Ptr->getType()}, Ops);
+ return CreateIntrinsic(Intrinsic::lifetime_start, {Ptr->getType()}, {Ptr});
}
-CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) {
+CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr) {
assert(isa<PointerType>(Ptr->getType()) &&
"lifetime.end only applies to pointers.");
- if (!Size)
- Size = getInt64(-1);
- else
- assert(Size->getType() == getInt64Ty() &&
- "lifetime.end requires the size to be an i64");
- Value *Ops[] = { Size, Ptr };
- return CreateIntrinsic(Intrinsic::lifetime_end, {Ptr->getType()}, Ops);
+ return CreateIntrinsic(Intrinsic::lifetime_end, {Ptr->getType()}, {Ptr});
}
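With the size operand gone, lifetime markers are created from the pointer alone; a minimal sketch (assumes an IRBuilder<> B already positioned inside a function; names hypothetical):

    AllocaInst *Slot = B.CreateAlloca(B.getInt32Ty(), nullptr, "slot");
    B.CreateLifetimeStart(Slot); // no i64 size argument any more
    // ... uses of Slot ...
    B.CreateLifetimeEnd(Slot);
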
CallInst *IRBuilderBase::CreateInvariantStart(Value *Ptr, ConstantInt *Size) {
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index b7cd12a..4540268 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -817,6 +817,7 @@ const char *Instruction::getOpcodeName(unsigned OpCode) {
case UIToFP: return "uitofp";
case SIToFP: return "sitofp";
case IntToPtr: return "inttoptr";
+ case PtrToAddr: return "ptrtoaddr";
case PtrToInt: return "ptrtoint";
case BitCast: return "bitcast";
case AddrSpaceCast: return "addrspacecast";
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index b896382..a1751c0 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -2798,6 +2798,7 @@ bool CastInst::isNoopCast(Instruction::CastOps Opcode,
return false;
case Instruction::BitCast:
return true; // BitCast never modifies bits.
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
return DL.getIntPtrType(SrcTy)->getScalarSizeInBits() ==
DestTy->getScalarSizeInBits();
@@ -2855,26 +2856,29 @@ unsigned CastInst::isEliminableCastPair(
// same reason.
const unsigned numCastOps =
Instruction::CastOpsEnd - Instruction::CastOpsBegin;
+ // clang-format off
static const uint8_t CastResults[numCastOps][numCastOps] = {
- // T F F U S F F P I B A -+
- // R Z S P P I I T P 2 N T S |
- // U E E 2 2 2 2 R E I T C C +- secondOp
- // N X X U S F F N X N 2 V V |
- // C T T I I P P C T T P T T -+
- { 1, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // Trunc -+
- { 8, 1, 9,99,99, 2,17,99,99,99, 2, 3, 0}, // ZExt |
- { 8, 0, 1,99,99, 0, 2,99,99,99, 0, 3, 0}, // SExt |
- { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToUI |
- { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToSI |
- { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // UIToFP +- firstOp
- { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // SIToFP |
- { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // FPTrunc |
- { 99,99,99, 2, 2,99,99, 8, 2,99,99, 4, 0}, // FPExt |
- { 1, 0, 0,99,99, 0, 0,99,99,99, 7, 3, 0}, // PtrToInt |
- { 99,99,99,99,99,99,99,99,99,11,99,15, 0}, // IntToPtr |
- { 5, 5, 5, 0, 0, 5, 5, 0, 0,16, 5, 1,14}, // BitCast |
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+
+ // T F F U S F F P P I B A -+
+ // R Z S P P I I T P 2 2 N T S |
+ // U E E 2 2 2 2 R E I A T C C +- secondOp
+ // N X X U S F F N X N D 2 V V |
+ // C T T I I P P C T T R P T T -+
+ { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // Trunc -+
+ { 8, 1, 9,99,99, 2,17,99,99,99,99, 2, 3, 0}, // ZExt |
+ { 8, 0, 1,99,99, 0, 2,99,99,99,99, 0, 3, 0}, // SExt |
+ { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // FPToUI |
+ { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // FPToSI |
+ { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // UIToFP +- firstOp
+ { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // SIToFP |
+ { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // FPTrunc |
+ { 99,99,99, 2, 2,99,99, 8, 2,99,99,99, 4, 0}, // FPExt |
+ { 1, 0, 0,99,99, 0, 0,99,99,99,99, 7, 3, 0}, // PtrToInt |
+ { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr |
+ { 99,99,99,99,99,99,99,99,99,11,99,99,15, 0}, // IntToPtr |
+ { 5, 5, 5, 0, 0, 5, 5, 0, 0,16,16, 5, 1,14}, // BitCast |
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+
};
+ // clang-format on
// TODO: This logic could be encoded into the table above and handled in the
// switch below.
@@ -3046,6 +3050,7 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty,
case SIToFP: return new SIToFPInst (S, Ty, Name, InsertBefore);
case FPToUI: return new FPToUIInst (S, Ty, Name, InsertBefore);
case FPToSI: return new FPToSIInst (S, Ty, Name, InsertBefore);
+ case PtrToAddr: return new PtrToAddrInst (S, Ty, Name, InsertBefore);
case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertBefore);
case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertBefore);
case BitCast:
@@ -3347,6 +3352,7 @@ CastInst::castIsValid(Instruction::CastOps op, Type *SrcTy, Type *DstTy) {
case Instruction::FPToSI:
return SrcTy->isFPOrFPVectorTy() && DstTy->isIntOrIntVectorTy() &&
SrcEC == DstEC;
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
if (SrcEC != DstEC)
return false;
@@ -3460,6 +3466,12 @@ PtrToIntInst::PtrToIntInst(Value *S, Type *Ty, const Twine &Name,
assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt");
}
+PtrToAddrInst::PtrToAddrInst(Value *S, Type *Ty, const Twine &Name,
+ InsertPosition InsertBefore)
+ : CastInst(Ty, PtrToAddr, S, Name, InsertBefore) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToAddr");
+}
+
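The instruction form is also reachable through the generic cast factory, which now dispatches to this constructor (a sketch; Ptr and InsertBefore are hypothetical, and i64 assumes a 64-bit address width):

    Value *Addr = CastInst::Create(Instruction::PtrToAddr, Ptr,
                                   Type::getInt64Ty(Ctx), "addr", InsertBefore);
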
IntToPtrInst::IntToPtrInst(Value *S, Type *Ty, const Twine &Name,
InsertPosition InsertBefore)
: CastInst(Ty, IntToPtr, S, Name, InsertBefore) {
@@ -4427,6 +4439,10 @@ PtrToIntInst *PtrToIntInst::cloneImpl() const {
return new PtrToIntInst(getOperand(0), getType());
}
+PtrToAddrInst *PtrToAddrInst::cloneImpl() const {
+ return new PtrToAddrInst(getOperand(0), getType());
+}
+
IntToPtrInst *IntToPtrInst::cloneImpl() const {
return new IntToPtrInst(getOperand(0), getType());
}
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index aa2a60e..e03f993 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -312,10 +312,8 @@ template <> struct MDNodeKeyImpl<MDTuple> : MDNodeOpsKey {
template <> struct MDNodeKeyImpl<DILocation> {
Metadata *Scope;
Metadata *InlinedAt;
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
uint64_t AtomGroup : 61;
uint64_t AtomRank : 3;
-#endif
unsigned Line;
uint16_t Column;
bool ImplicitCode;
@@ -323,36 +321,24 @@ template <> struct MDNodeKeyImpl<DILocation> {
MDNodeKeyImpl(unsigned Line, uint16_t Column, Metadata *Scope,
Metadata *InlinedAt, bool ImplicitCode, uint64_t AtomGroup,
uint8_t AtomRank)
- : Scope(Scope), InlinedAt(InlinedAt),
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
- AtomGroup(AtomGroup), AtomRank(AtomRank),
-#endif
- Line(Line), Column(Column), ImplicitCode(ImplicitCode) {
- }
+ : Scope(Scope), InlinedAt(InlinedAt), AtomGroup(AtomGroup),
+ AtomRank(AtomRank), Line(Line), Column(Column),
+ ImplicitCode(ImplicitCode) {}
MDNodeKeyImpl(const DILocation *L)
: Scope(L->getRawScope()), InlinedAt(L->getRawInlinedAt()),
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
AtomGroup(L->getAtomGroup()), AtomRank(L->getAtomRank()),
-#endif
Line(L->getLine()), Column(L->getColumn()),
- ImplicitCode(L->isImplicitCode()) {
- }
+ ImplicitCode(L->isImplicitCode()) {}
bool isKeyOf(const DILocation *RHS) const {
return Line == RHS->getLine() && Column == RHS->getColumn() &&
Scope == RHS->getRawScope() && InlinedAt == RHS->getRawInlinedAt() &&
- ImplicitCode == RHS->isImplicitCode()
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
- && AtomGroup == RHS->getAtomGroup() &&
- AtomRank == RHS->getAtomRank();
-#else
- ;
-#endif
+ ImplicitCode == RHS->isImplicitCode() &&
+ AtomGroup == RHS->getAtomGroup() && AtomRank == RHS->getAtomRank();
}
unsigned getHashValue() const {
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
// Hashing AtomGroup and AtomRank substantially impacts performance whether
// Key Instructions is enabled or not. We can't detect whether it's enabled
// here cheaply; avoiding hashing zero values is a good approximation. This
@@ -363,7 +349,6 @@ template <> struct MDNodeKeyImpl<DILocation> {
if (AtomGroup || AtomRank)
return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode,
AtomGroup, (uint8_t)AtomRank);
-#endif
return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode);
}
};
diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index 129ca4a..5928c89 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -747,34 +747,28 @@ const Value *Value::stripAndAccumulateConstantOffsets(
// means when we construct GEPOffset, we need to use the size
// of GEP's pointer type rather than the size of the original
// pointer type.
- unsigned CurBitWidth = DL.getIndexTypeSizeInBits(V->getType());
- if (CurBitWidth == BitWidth) {
- if (!GEP->accumulateConstantOffset(DL, Offset, ExternalAnalysis))
- return V;
- } else {
- APInt GEPOffset(CurBitWidth, 0);
- if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis))
- return V;
+ APInt GEPOffset(DL.getIndexTypeSizeInBits(V->getType()), 0);
+ if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis))
+ return V;
- // Stop traversal if the pointer offset wouldn't fit in the bit-width
- // provided by the Offset argument. This can happen due to AddrSpaceCast
- // stripping.
- if (GEPOffset.getSignificantBits() > BitWidth)
- return V;
+ // Stop traversal if the pointer offset wouldn't fit in the bit-width
+ // provided by the Offset argument. This can happen due to AddrSpaceCast
+ // stripping.
+ if (GEPOffset.getSignificantBits() > BitWidth)
+ return V;
- // External Analysis can return a result higher/lower than the value
- // represents. We need to detect overflow/underflow.
- APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth);
- if (!ExternalAnalysis) {
- Offset += GEPOffsetST;
- } else {
- bool Overflow = false;
- APInt OldOffset = Offset;
- Offset = Offset.sadd_ov(GEPOffsetST, Overflow);
- if (Overflow) {
- Offset = OldOffset;
- return V;
- }
+ // External Analysis can return a result higher/lower than the value
+ // represents. We need to detect overflow/underflow.
+ APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth);
+ if (!ExternalAnalysis) {
+ Offset += GEPOffsetST;
+ } else {
+ bool Overflow = false;
+ APInt OldOffset = Offset;
+ Offset = Offset.sadd_ov(GEPOffsetST, Overflow);
+ if (Overflow) {
+ Offset = OldOffset;
+ return V;
}
}
V = GEP->getPointerOperand();
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index ca3f148..1d3c379 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -566,6 +566,8 @@ private:
void visitUIToFPInst(UIToFPInst &I);
void visitSIToFPInst(SIToFPInst &I);
void visitIntToPtrInst(IntToPtrInst &I);
+ void checkPtrToAddr(Type *SrcTy, Type *DestTy, const Value &V);
+ void visitPtrToAddrInst(PtrToAddrInst &I);
void visitPtrToIntInst(PtrToIntInst &I);
void visitBitCastInst(BitCastInst &I);
void visitAddrSpaceCastInst(AddrSpaceCastInst &I);
@@ -834,6 +836,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
&GV);
Check(GV.getInitializer()->getType()->isSized(),
"Global variable initializer must be sized", &GV);
+ visitConstantExprsRecursively(GV.getInitializer());
// If the global has common linkage, it must have a zero initializer and
// cannot be constant.
if (GV.hasCommonLinkage()) {
@@ -2610,6 +2613,8 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) {
Check(CastInst::castIsValid(Instruction::BitCast, CE->getOperand(0),
CE->getType()),
"Invalid bitcast", CE);
+ else if (CE->getOpcode() == Instruction::PtrToAddr)
+ checkPtrToAddr(CE->getOperand(0)->getType(), CE->getType(), *CE);
}
void Verifier::visitConstantPtrAuth(const ConstantPtrAuth *CPA) {
@@ -3532,6 +3537,28 @@ void Verifier::visitFPToSIInst(FPToSIInst &I) {
visitInstruction(I);
}
+void Verifier::checkPtrToAddr(Type *SrcTy, Type *DestTy, const Value &V) {
+ Check(SrcTy->isPtrOrPtrVectorTy(), "PtrToAddr source must be pointer", V);
+ Check(DestTy->isIntOrIntVectorTy(), "PtrToAddr result must be integral", V);
+ Check(SrcTy->isVectorTy() == DestTy->isVectorTy(), "PtrToAddr type mismatch",
+ V);
+
+ if (SrcTy->isVectorTy()) {
+ auto *VSrc = cast<VectorType>(SrcTy);
+ auto *VDest = cast<VectorType>(DestTy);
+ Check(VSrc->getElementCount() == VDest->getElementCount(),
+ "PtrToAddr vector length mismatch", V);
+ }
+
+ Type *AddrTy = DL.getAddressType(SrcTy);
+ Check(AddrTy == DestTy, "PtrToAddr result must be address width", V);
+}
+
+void Verifier::visitPtrToAddrInst(PtrToAddrInst &I) {
+ checkPtrToAddr(I.getOperand(0)->getType(), I.getType(), I);
+ visitInstruction(I);
+}
+
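Compared with ptrtoint below, the added constraint is the exact result width (illustrative; assumes pointers in the address space use 64 address bits):

    // Accepted:                     ptrtoaddr ptr %p to i64
    // Rejected by the AddrTy check: ptrtoaddr ptr %p to i32
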
void Verifier::visitPtrToIntInst(PtrToIntInst &I) {
// Get the source and destination types
Type *SrcTy = I.getOperand(0)->getType();
@@ -3547,7 +3574,7 @@ void Verifier::visitPtrToIntInst(PtrToIntInst &I) {
auto *VSrc = cast<VectorType>(SrcTy);
auto *VDest = cast<VectorType>(DestTy);
Check(VSrc->getElementCount() == VDest->getElementCount(),
- "PtrToInt Vector width mismatch", &I);
+ "PtrToInt Vector length mismatch", &I);
}
visitInstruction(I);
@@ -3567,7 +3594,7 @@ void Verifier::visitIntToPtrInst(IntToPtrInst &I) {
auto *VSrc = cast<VectorType>(SrcTy);
auto *VDest = cast<VectorType>(DestTy);
Check(VSrc->getElementCount() == VDest->getElementCount(),
- "IntToPtr Vector width mismatch", &I);
+ "IntToPtr Vector length mismatch", &I);
}
visitInstruction(I);
}
@@ -6770,7 +6797,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
}
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end: {
- Value *Ptr = Call.getArgOperand(1);
+ Value *Ptr = Call.getArgOperand(0);
Check(isa<AllocaInst>(Ptr) || isa<PoisonValue>(Ptr),
"llvm.lifetime.start/end can only be used on alloca or poison",
&Call);
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 8c27958..d0c6144 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -443,7 +443,7 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst,
// MCAssembler::relaxAlign.
auto *Sec = F->getParent();
if (!Sec->isLinkerRelaxable())
- Sec->setLinkerRelaxable();
+ Sec->setFirstLinkerRelaxable(F->getLayoutOrder());
// Do not add data after a linker-relaxable instruction. The difference
// between a new label and a label at or before the linker-relaxable
// instruction cannot be resolved at assemble-time.
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index 27ca131..9ed6fd1 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -20,7 +20,7 @@ using namespace llvm;
MCSection::MCSection(StringRef Name, bool IsText, bool IsBss, MCSymbol *Begin)
: Begin(Begin), HasInstructions(false), IsRegistered(false), IsText(IsText),
- IsBss(IsBss), LinkerRelaxable(false), Name(Name) {
+ IsBss(IsBss), Name(Name) {
DummyFragment.setParent(this);
}
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index 7ca26aa..df807fc 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -331,61 +331,34 @@ void InstrProfWriter::addDataAccessProfData(
DataAccessProfileData = std::move(DataAccessProfDataIn);
}
-void InstrProfWriter::addTemporalProfileTrace(TemporalProfTraceTy Trace) {
- assert(Trace.FunctionNameRefs.size() <= MaxTemporalProfTraceLength);
- assert(!Trace.FunctionNameRefs.empty());
- if (TemporalProfTraceStreamSize < TemporalProfTraceReservoirSize) {
- // Simply append the trace if we have not yet hit our reservoir size limit.
- TemporalProfTraces.push_back(std::move(Trace));
- } else {
- // Otherwise, replace a random trace in the stream.
- std::uniform_int_distribution<uint64_t> Distribution(
- 0, TemporalProfTraceStreamSize);
- uint64_t RandomIndex = Distribution(RNG);
- if (RandomIndex < TemporalProfTraces.size())
- TemporalProfTraces[RandomIndex] = std::move(Trace);
- }
- ++TemporalProfTraceStreamSize;
-}
-
void InstrProfWriter::addTemporalProfileTraces(
SmallVectorImpl<TemporalProfTraceTy> &SrcTraces, uint64_t SrcStreamSize) {
+ if (TemporalProfTraces.size() > TemporalProfTraceReservoirSize)
+ TemporalProfTraces.truncate(TemporalProfTraceReservoirSize);
for (auto &Trace : SrcTraces)
if (Trace.FunctionNameRefs.size() > MaxTemporalProfTraceLength)
Trace.FunctionNameRefs.resize(MaxTemporalProfTraceLength);
llvm::erase_if(SrcTraces, [](auto &T) { return T.FunctionNameRefs.empty(); });
- // Assume that the source has the same reservoir size as the destination to
- // avoid needing to record it in the indexed profile format.
- bool IsDestSampled =
- (TemporalProfTraceStreamSize > TemporalProfTraceReservoirSize);
- bool IsSrcSampled = (SrcStreamSize > TemporalProfTraceReservoirSize);
- if (!IsDestSampled && IsSrcSampled) {
- // If one of the traces are sampled, ensure that it belongs to Dest.
- std::swap(TemporalProfTraces, SrcTraces);
- std::swap(TemporalProfTraceStreamSize, SrcStreamSize);
- std::swap(IsDestSampled, IsSrcSampled);
- }
- if (!IsSrcSampled) {
- // If the source stream is not sampled, we add each source trace normally.
- for (auto &Trace : SrcTraces)
- addTemporalProfileTrace(std::move(Trace));
+ // If there are no source traces, it is probably because
+ // --temporal-profile-max-trace-length=0 was set to deliberately remove all
+ // traces. In that case, we do not want to increase the stream size.
+ if (SrcTraces.empty())
return;
- }
- // Otherwise, we find the traces that would have been removed if we added
- // the whole source stream.
- SmallSetVector<uint64_t, 8> IndicesToReplace;
- for (uint64_t I = 0; I < SrcStreamSize; I++) {
- std::uniform_int_distribution<uint64_t> Distribution(
- 0, TemporalProfTraceStreamSize);
+ // Add traces until our reservoir is full or we run out of source traces.
+ auto SrcTraceIt = SrcTraces.begin();
+ while (TemporalProfTraces.size() < TemporalProfTraceReservoirSize &&
+ SrcTraceIt < SrcTraces.end())
+ TemporalProfTraces.push_back(*SrcTraceIt++);
+ // Our reservoir is full; we need to sample the source stream.
+ llvm::shuffle(SrcTraceIt, SrcTraces.end(), RNG);
+ for (uint64_t I = TemporalProfTraces.size();
+ I < SrcStreamSize && SrcTraceIt < SrcTraces.end(); I++) {
+ std::uniform_int_distribution<uint64_t> Distribution(0, I);
uint64_t RandomIndex = Distribution(RNG);
if (RandomIndex < TemporalProfTraces.size())
- IndicesToReplace.insert(RandomIndex);
- ++TemporalProfTraceStreamSize;
+ TemporalProfTraces[RandomIndex] = *SrcTraceIt++;
}
- // Then we insert a random sample of the source traces.
- llvm::shuffle(SrcTraces.begin(), SrcTraces.end(), RNG);
- for (const auto &[Index, Trace] : llvm::zip(IndicesToReplace, SrcTraces))
- TemporalProfTraces[Index] = std::move(Trace);
+ TemporalProfTraceStreamSize += SrcStreamSize;
}
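Once the reservoir is full, the sampling loop above is classic reservoir sampling (Algorithm R); in isolation it behaves like this sketch (UniformInt is a hypothetical inclusive-range helper, R the reservoir size):

    for (uint64_t I = R; I < StreamSize; ++I) {
      uint64_t J = UniformInt(0, I); // uniform over the I + 1 items seen so far
      if (J < R)                     // item I survives with probability R/(I+1)
        Reservoir[J] = Stream[I];
    }
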
void InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW,
diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp
index fe34037..70ac68a 100644
--- a/llvm/lib/SandboxIR/Context.cpp
+++ b/llvm/lib/SandboxIR/Context.cpp
@@ -256,6 +256,7 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) {
case llvm::Instruction::FPToUI:
case llvm::Instruction::FPToSI:
case llvm::Instruction::FPExt:
+ case llvm::Instruction::PtrToAddr:
case llvm::Instruction::PtrToInt:
case llvm::Instruction::IntToPtr:
case llvm::Instruction::SIToFP:
diff --git a/llvm/lib/SandboxIR/Instruction.cpp b/llvm/lib/SandboxIR/Instruction.cpp
index 956047c..1a81d18 100644
--- a/llvm/lib/SandboxIR/Instruction.cpp
+++ b/llvm/lib/SandboxIR/Instruction.cpp
@@ -1007,6 +1007,9 @@ static llvm::Instruction::CastOps getLLVMCastOp(Instruction::Opcode Opc) {
return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPToSI);
case Instruction::Opcode::FPExt:
return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPExt);
+ case Instruction::Opcode::PtrToAddr:
+ return static_cast<llvm::Instruction::CastOps>(
+ llvm::Instruction::PtrToAddr);
case Instruction::Opcode::PtrToInt:
return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::PtrToInt);
case Instruction::Opcode::IntToPtr:
diff --git a/llvm/lib/Support/StringRef.cpp b/llvm/lib/Support/StringRef.cpp
index dc75878..b6a2f8a 100644
--- a/llvm/lib/Support/StringRef.cpp
+++ b/llvm/lib/Support/StringRef.cpp
@@ -385,7 +385,7 @@ size_t StringRef::count(StringRef Str) const {
return Count;
}
-static unsigned GetAutoSenseRadix(StringRef &Str) {
+unsigned llvm::getAutoSenseRadix(StringRef &Str) {
if (Str.empty())
return 10;
@@ -410,7 +410,7 @@ bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix,
unsigned long long &Result) {
// Autosense radix if not specified.
if (Radix == 0)
- Radix = GetAutoSenseRadix(Str);
+ Radix = getAutoSenseRadix(Str);
// Empty strings (after the radix autosense) are invalid.
if (Str.empty()) return true;
@@ -509,7 +509,7 @@ bool StringRef::consumeInteger(unsigned Radix, APInt &Result) {
// Autosense radix if not specified.
if (Radix == 0)
- Radix = GetAutoSenseRadix(Str);
+ Radix = getAutoSenseRadix(Str);
assert(Radix > 1 && Radix <= 36);
diff --git a/llvm/lib/Support/regcomp.c b/llvm/lib/Support/regcomp.c
index 4ed5982..f5c4778 100644
--- a/llvm/lib/Support/regcomp.c
+++ b/llvm/lib/Support/regcomp.c
@@ -305,7 +305,7 @@ llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags) {
return (REG_INVARG);
len = preg->re_endp - pattern;
} else {
- len = strlen((const char *)pattern);
+ len = strlen(pattern);
}
/* do the mallocs early so failure handling is easy */
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 201bfe0..d6a3d59 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1236,14 +1236,20 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
.add(MI.getOperand(3));
transferImpOps(MI, I, I);
} else {
+ unsigned RegState =
+ getRenamableRegState(MI.getOperand(1).isRenamable()) |
+ getKillRegState(
+ MI.getOperand(1).isKill() &&
+ MI.getOperand(1).getReg() != MI.getOperand(2).getReg() &&
+ MI.getOperand(1).getReg() != MI.getOperand(3).getReg());
BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::ORRv8i8
: AArch64::ORRv16i8))
.addReg(DstReg,
RegState::Define |
getRenamableRegState(MI.getOperand(0).isRenamable()))
- .add(MI.getOperand(1))
- .add(MI.getOperand(1));
+ .addReg(MI.getOperand(1).getReg(), RegState)
+ .addReg(MI.getOperand(1).getReg(), RegState);
auto I2 =
BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1f5ff4e..3c06c6a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8537,7 +8537,7 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI,
if (IsCalleeWin64) {
UseVarArgCC = true;
} else {
- UseVarArgCC = !Outs[i].IsFixed;
+ UseVarArgCC = ArgFlags.isVarArg();
}
}
@@ -8982,7 +8982,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
- if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
+ if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
report_fatal_error("Passing SVE types to variadic functions is "
"currently not supported");
}
@@ -14742,6 +14742,106 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
return ResultSLI;
}
+static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64TargetLowering &TLI) {
+ EVT VT = N->getValueType(0);
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+ const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+
+ if (!VT.isVector())
+ return SDValue();
+
+ if (VT.isScalableVector() && !Subtarget.hasSVE2())
+ return SDValue();
+
+ if (VT.isFixedLengthVector() &&
+ (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ if (N0.getOpcode() != ISD::AND)
+ return SDValue();
+
+ SDValue N1 = N->getOperand(1);
+ if (N1.getOpcode() != ISD::AND)
+ return SDValue();
+
+ // InstCombine does (not (neg a)) => (add a -1).
+ // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
+ // Loop over all combinations of AND operands.
+ for (int i = 1; i >= 0; --i) {
+ for (int j = 1; j >= 0; --j) {
+ SDValue O0 = N0->getOperand(i);
+ SDValue O1 = N1->getOperand(j);
+ SDValue Sub, Add, SubSibling, AddSibling;
+
+ // Find a SUB and an ADD operand, one from each AND.
+ if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
+ Sub = O0;
+ Add = O1;
+ SubSibling = N0->getOperand(1 - i);
+ AddSibling = N1->getOperand(1 - j);
+ } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
+ Add = O0;
+ Sub = O1;
+ AddSibling = N0->getOperand(1 - i);
+ SubSibling = N1->getOperand(1 - j);
+ } else
+ continue;
+
+ if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
+ continue;
+
+ // The all-ones constant is always the right-hand operand of the Add.
+ if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
+ continue;
+
+ if (Sub.getOperand(1) != Add.getOperand(0))
+ continue;
+
+ return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
+ }
+ }
+
+ // (or (and a b) (and (not a) c)) => (bsl a b c)
+ // We only have to look for constant vectors here since the general, variable
+ // case can be handled in TableGen.
+ unsigned Bits = VT.getScalarSizeInBits();
+ uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
+ for (int i = 1; i >= 0; --i)
+ for (int j = 1; j >= 0; --j) {
+ APInt Val1, Val2;
+
+ if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
+ ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
+ (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
+ return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
+ N0->getOperand(1 - i), N1->getOperand(1 - j));
+ }
+ BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
+ BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
+ if (!BVN0 || !BVN1)
+ continue;
+
+ bool FoundMatch = true;
+ for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
+ ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
+ ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
+ if (!CN0 || !CN1 ||
+ CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
+ FoundMatch = false;
+ break;
+ }
+ }
+ if (FoundMatch)
+ return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
+ N0->getOperand(1 - i), N1->getOperand(1 - j));
+ }
+
+ return SDValue();
+}
+
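The trailing constant-vector path matches complementary lane masks; a worked v8i8 instance (illustrative):

    // (or (and x, splat(0x0f)), (and y, splat(0xf0)))
    //   ==> BSP splat(0x0f), x, y     // since 0xf0 == 0xff & ~0x0f per lane
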
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
if (useSVEForFixedLengthVectorVT(Op.getValueType(),
@@ -19419,106 +19519,6 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
return FixConv;
}
-static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
- const AArch64TargetLowering &TLI) {
- EVT VT = N->getValueType(0);
- SelectionDAG &DAG = DCI.DAG;
- SDLoc DL(N);
- const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
-
- if (!VT.isVector())
- return SDValue();
-
- if (VT.isScalableVector() && !Subtarget.hasSVE2())
- return SDValue();
-
- if (VT.isFixedLengthVector() &&
- (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
- return SDValue();
-
- SDValue N0 = N->getOperand(0);
- if (N0.getOpcode() != ISD::AND)
- return SDValue();
-
- SDValue N1 = N->getOperand(1);
- if (N1.getOpcode() != ISD::AND)
- return SDValue();
-
- // InstCombine does (not (neg a)) => (add a -1).
- // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
- // Loop over all combinations of AND operands.
- for (int i = 1; i >= 0; --i) {
- for (int j = 1; j >= 0; --j) {
- SDValue O0 = N0->getOperand(i);
- SDValue O1 = N1->getOperand(j);
- SDValue Sub, Add, SubSibling, AddSibling;
-
- // Find a SUB and an ADD operand, one from each AND.
- if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
- Sub = O0;
- Add = O1;
- SubSibling = N0->getOperand(1 - i);
- AddSibling = N1->getOperand(1 - j);
- } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
- Add = O0;
- Sub = O1;
- AddSibling = N0->getOperand(1 - i);
- SubSibling = N1->getOperand(1 - j);
- } else
- continue;
-
- if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
- continue;
-
- // Constant ones is always righthand operand of the Add.
- if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
- continue;
-
- if (Sub.getOperand(1) != Add.getOperand(0))
- continue;
-
- return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
- }
- }
-
- // (or (and a b) (and (not a) c)) => (bsl a b c)
- // We only have to look for constant vectors here since the general, variable
- // case can be handled in TableGen.
- unsigned Bits = VT.getScalarSizeInBits();
- uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
- for (int i = 1; i >= 0; --i)
- for (int j = 1; j >= 0; --j) {
- APInt Val1, Val2;
-
- if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
- ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
- (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
- return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
- N0->getOperand(1 - i), N1->getOperand(1 - j));
- }
- BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
- BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
- if (!BVN0 || !BVN1)
- continue;
-
- bool FoundMatch = true;
- for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
- ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
- ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
- if (!CN0 || !CN1 ||
- CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
- FoundMatch = false;
- break;
- }
- }
- if (FoundMatch)
- return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
- N0->getOperand(1 - i), N1->getOperand(1 - j));
- }
-
- return SDValue();
-}
-
// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
// convert to csel(ccmp(.., cc0)), depending on cc1:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index d068a12..b033f88 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -7362,7 +7362,9 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
[(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
def v16i8 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
V128, V128, V128,
- asm#"2", ".8h", ".16b", ".16b", []>;
+ asm#"2", ".8h", ".16b", ".16b",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))),
+ (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm)))))]>;
let Predicates = [HasAES] in {
def v1i64 : BaseSIMDDifferentThreeVector<U, 0b110, opc,
V128, V64, V64,
@@ -7374,10 +7376,6 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
[(set (v16i8 V128:$Rd), (OpNode (extract_high_v2i64 (v2i64 V128:$Rn)),
(extract_high_v2i64 (v2i64 V128:$Rm))))]>;
}
-
- def : Pat<(v8i16 (OpNode (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))),
- (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm))))),
- (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>;
}
multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
@@ -7402,6 +7400,7 @@ multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
(extract_high_v4i32 (v4i32 V128:$Rm))))]>;
}
+let isCommutable = 1 in
multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
@@ -7483,6 +7482,7 @@ multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
(extract_high_v4i32 (v4i32 V128:$Rm)))))))]>;
}
+let isCommutable = 1 in
multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ac31236..8cfbff9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6055,6 +6055,7 @@ defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>;
defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>;
defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
+let isCommutable = 1 in
defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
TriOpFrag<(add node:$LHS, (abds node:$MHS, node:$RHS))> >;
@@ -6806,6 +6807,7 @@ defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>
defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>;
defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
+let isCommutable = 1 in
defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>;
defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", abds>;
defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", abds>;
@@ -6822,6 +6824,7 @@ defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", saddsat>;
defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", ssubsat>;
defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
int_aarch64_neon_sqdmull>;
+let isCommutable = 0 in
defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
@@ -6836,6 +6839,7 @@ defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", AArch64umull>;
+let isCommutable = 0 in
defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>;
defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index b97d622..fd4ef2a 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -8,8 +8,8 @@
//
// This pass performs the below peephole optimizations at the MIR level.
//
-// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
-// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
+// 1. MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri
+// MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri
//
// 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi
// MOVi64imm + ADDXrr ==> ADDXri + ADDXri
@@ -128,6 +128,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
// Strategy used to split logical immediate bitmasks.
enum class SplitStrategy {
Intersect,
+ Disjoint,
};
template <typename T>
bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
@@ -163,6 +164,7 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
T UImm = static_cast<T>(Imm);
+ assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!");
// The bitmask immediate consists of consecutive ones. Let's say there is
// constant 0b00000000001000000000010000000000 which does not consist of
@@ -191,18 +193,47 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
}
template <typename T>
+static bool splitDisjointBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc,
+ T &Imm2Enc) {
+ assert(Imm && (Imm != ~static_cast<T>(0)) && "Invalid immediate!");
+
+ // Try to split a bitmask of the form 0b00000000011000000000011110000000 into
+ // two disjoint masks such as 0b00000000011000000000000000000000 and
+ // 0b00000000000000000000011110000000 where the inclusive/exclusive OR of the
+ // new masks matches the original mask.
+ unsigned LowestBitSet = llvm::countr_zero(Imm);
+ unsigned LowestGapBitUnset =
+ LowestBitSet + llvm::countr_one(Imm >> LowestBitSet);
+
+ // Create a mask for the least significant group of consecutive ones.
+ assert(LowestGapBitUnset < sizeof(T) * CHAR_BIT && "Undefined behaviour!");
+ T NewImm1 = (static_cast<T>(1) << LowestGapBitUnset) -
+ (static_cast<T>(1) << LowestBitSet);
+ // Create a disjoint mask for the remaining ones.
+ T NewImm2 = Imm & ~NewImm1;
+
+ // Do not split if NewImm2 is not a valid bitmask immediate.
+ if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
+ return false;
+
+ Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
+ Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
+ return true;
+}
+
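Tracing the mask from the comment above through the new helper (a worked 32-bit example):

    // Imm          = 0b00000000011000000000011110000000
    // LowestBitSet = 7, countr_one(Imm >> 7) = 4, so LowestGapBitUnset = 11
    // NewImm1      = (1 << 11) - (1 << 7) = 0b00000000000000000000011110000000
    // NewImm2      = Imm & ~NewImm1      = 0b00000000011000000000000000000000
    // Both are contiguous runs of ones, so both encode as logical immediates.
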
+template <typename T>
bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
SplitStrategy Strategy,
unsigned OtherOpc) {
- // Try below transformation.
+ // Try below transformations.
//
- // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
- // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
+ // MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri
+ // MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri
//
// The mov pseudo instruction could be expanded to multiple mov instructions
// later. Let's try to split the constant operand of mov instruction into two
- // bitmask immediates. It makes only two AND instructions instead of multiple
- // mov + and instructions.
+ // bitmask immediates based on the given split strategy. This yields only
+ // two logical instructions instead of multiple mov + logic instructions.
return splitTwoPartImm<T>(
MI,
@@ -224,6 +255,9 @@ bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
case SplitStrategy::Intersect:
SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1);
break;
+ case SplitStrategy::Disjoint:
+ SplitSucc = splitDisjointBitmaskImm(Imm, RegSize, Imm0, Imm1);
+ break;
}
if (SplitSucc)
return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc);
@@ -889,6 +923,22 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
Changed |= trySplitLogicalImm<uint64_t>(
AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri);
break;
+ case AArch64::EORWrr:
+ Changed |= trySplitLogicalImm<uint32_t>(AArch64::EORWri, MI,
+ SplitStrategy::Disjoint);
+ break;
+ case AArch64::EORXrr:
+ Changed |= trySplitLogicalImm<uint64_t>(AArch64::EORXri, MI,
+ SplitStrategy::Disjoint);
+ break;
+ case AArch64::ORRWrr:
+ Changed |= trySplitLogicalImm<uint32_t>(AArch64::ORRWri, MI,
+ SplitStrategy::Disjoint);
+ break;
+ case AArch64::ORRXrr:
+ Changed |= trySplitLogicalImm<uint64_t>(AArch64::ORRXri, MI,
+ SplitStrategy::Disjoint);
+ break;
case AArch64::ORRWrs:
Changed |= visitORR(MI);
break;
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index adc984a..1bc1d98 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -22,7 +22,8 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320",
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
- FeatureUseWzrToVecMove]>;
+ FeatureUseWzrToVecMove,
+ FeatureUseFixedOverScalableIfEqualCost]>;
def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
"Cortex-A53 ARM processors", [
@@ -45,7 +46,8 @@ def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510",
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
- FeatureUseWzrToVecMove
+ FeatureUseWzrToVecMove,
+ FeatureUseFixedOverScalableIfEqualCost
]>;
def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520",
@@ -53,7 +55,8 @@ def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520",
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
- FeatureUseWzrToVecMove]>;
+ FeatureUseWzrToVecMove,
+ FeatureUseFixedOverScalableIfEqualCost]>;
def TuneA520AE : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520",
"Cortex-A520AE ARM processors", [
@@ -756,7 +759,6 @@ def ProcessorFeatures {
FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2,
FeatureComplxNum, FeatureCRC, FeatureDotProd,
FeatureFPARMv8,FeatureFullFP16, FeatureJS, FeatureLSE,
- FeatureUseFixedOverScalableIfEqualCost,
FeatureRAS, FeatureRCPC, FeatureRDM, FeatureFPAC];
list<SubtargetFeature> A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
FeatureMTE, FeatureETE, FeatureSVEBitPerm,
@@ -766,7 +768,6 @@ def ProcessorFeatures {
FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC,
FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, FeatureJS,
FeatureNEON, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM,
- FeatureUseFixedOverScalableIfEqualCost,
FeatureDotProd, FeatureFPAC];
list<SubtargetFeature> A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
FeatureMTE, FeatureETE, FeatureSVEBitPerm,
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index f136a184..a67bd42 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -585,8 +585,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
ClMaxLifetimes);
if (StandardLifetime) {
IntrinsicInst *Start = Info.LifetimeStart[0];
- uint64_t Size =
- cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue();
+ uint64_t Size = *Info.AI->getAllocationSize(*DL);
Size = alignTo(Size, kTagGranuleSize);
tagAlloca(AI, Start->getNextNode(), TagPCall, Size);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 9f05add..5c94aeb 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -554,7 +554,17 @@ static bool isUnpackedVectorVT(EVT VecVT) {
VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}
-static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
+static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
+ const IntrinsicCostAttributes &ICA) {
+ // We need to know at least the number of elements in the vector of buckets
+ // and the size of each element to update.
+ if (ICA.getArgTypes().size() < 2)
+ return InstructionCost::getInvalid();
+
+ // We are only interested in costing the SVE2 hardware instruction.
+ if (!ST->hasSVE2())
+ return InstructionCost::getInvalid();
+
Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
unsigned TotalHistCnts = 1;
@@ -579,9 +589,11 @@ static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
TotalHistCnts = EC / NaturalVectorWidth;
+
+ return InstructionCost(BaseHistCntCost * TotalHistCnts);
}
- return InstructionCost(BaseHistCntCost * TotalHistCnts);
+ return InstructionCost::getInvalid();
}
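A hedged walk-through of the arithmetic with made-up numbers (BaseHistCntCost is defined earlier in the real function and is not visible in this hunk; 8 below is only a placeholder):

#include <cstdio>

int main() {
  unsigned EC = 8;                                  // e.g. <vscale x 8 x i32> buckets
  unsigned LegalEltSize = 32;                       // legalized bucket element size
  unsigned NaturalVectorWidth = 128 / LegalEltSize; // SVEBitsPerBlock / LegalEltSize
  unsigned TotalHistCnts = EC / NaturalVectorWidth; // 2 histcnt operations
  unsigned BaseHistCntCost = 8;                     // placeholder value
  std::printf("cost = %u\n", BaseHistCntCost * TotalHistCnts); // cost = 16
}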
InstructionCost
@@ -597,10 +609,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return InstructionCost::getInvalid();
switch (ICA.getID()) {
- case Intrinsic::experimental_vector_histogram_add:
- if (!ST->hasSVE2())
- return InstructionCost::getInvalid();
- return getHistogramCost(ICA);
+ case Intrinsic::experimental_vector_histogram_add: {
+ InstructionCost HistCost = getHistogramCost(ST, ICA);
+ // If the cost isn't valid, we may still be able to scalarize the intrinsic.
+ if (HistCost.isValid())
+ return HistCost;
+ break;
+ }
case Intrinsic::umin:
case Intrinsic::umax:
case Intrinsic::smin:
@@ -3975,6 +3990,27 @@ InstructionCost AArch64TTIImpl::getScalarizationOverhead(
return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
}
+std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
+ Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
+ TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
+ std::function<InstructionCost(Type *)> InstCost) const {
+ if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
+ return std::nullopt;
+ if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
+ return std::nullopt;
+
+ Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
+ InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
+ TTI::CastContextHint::None, CostKind);
+ if (!Op1Info.isConstant() && !Op2Info.isConstant())
+ Cost *= 2;
+ Cost += InstCost(PromotedTy);
+ if (IncludeTrunc)
+ Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
+ TTI::CastContextHint::None, CostKind);
+ return Cost;
+}
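As a worked example of the composition (assuming, purely for illustration, unit costs for fpext, the promoted f32 op, and fptrunc): a half fadd with two non-constant operands on a target without +fullfp16 would be modeled as:

#include <cstdio>

int main() {
  unsigned ExtCost = 1, OpCost = 1, TruncCost = 1; // illustrative unit costs
  unsigned Cost = 2 * ExtCost  // both non-constant operands extended to f32
                  + OpCost     // operation performed in f32
                  + TruncCost; // result truncated back (IncludeTrunc=true)
  std::printf("%u\n", Cost); // 4, versus a single op when natively supported
}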
+
InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
@@ -3997,6 +4033,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ // Increase the cost for half and bfloat types if not architecturally
+ // supported.
+ if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
+ ISD == ISD::FDIV || ISD == ISD::FREM)
+ if (auto PromotedCost = getFP16BF16PromoteCost(
+ Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
+ [&](Type *PromotedTy) {
+ return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
+ Op1Info, Op2Info);
+ }))
+ return *PromotedCost;
+
switch (ISD) {
default:
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
@@ -4265,11 +4313,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
[[fallthrough]];
case ISD::FADD:
case ISD::FSUB:
- // Increase the cost for half and bfloat types if not architecturally
- // supported.
- if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
- (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
- return 2 * LT.first;
if (!Ty->getScalarType()->isFP128Ty())
return LT.first;
[[fallthrough]];
@@ -4371,25 +4414,21 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
}
if (Opcode == Instruction::FCmp) {
- // Without dedicated instructions we promote f16 + bf16 compares to f32.
- if ((!ST->hasFullFP16() && ValTy->getScalarType()->isHalfTy()) ||
- ValTy->getScalarType()->isBFloatTy()) {
- Type *PromotedTy =
- ValTy->getWithNewType(Type::getFloatTy(ValTy->getContext()));
- InstructionCost Cost =
- getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy,
- TTI::CastContextHint::None, CostKind);
- if (!Op1Info.isConstant() && !Op2Info.isConstant())
- Cost *= 2;
- Cost += getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, CostKind,
- Op1Info, Op2Info);
- if (ValTy->isVectorTy())
- Cost += getCastInstrCost(
- Instruction::Trunc, VectorType::getInteger(cast<VectorType>(ValTy)),
- VectorType::getInteger(cast<VectorType>(PromotedTy)),
- TTI::CastContextHint::None, CostKind);
- return Cost;
- }
+ if (auto PromotedCost = getFP16BF16PromoteCost(
+ ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
+ [&](Type *PromotedTy) {
+ InstructionCost Cost =
+ getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
+ CostKind, Op1Info, Op2Info);
+ if (isa<VectorType>(PromotedTy))
+ Cost += getCastInstrCost(
+ Instruction::Trunc,
+ VectorType::getInteger(cast<VectorType>(ValTy)),
+ VectorType::getInteger(cast<VectorType>(PromotedTy)),
+ TTI::CastContextHint::None, CostKind);
+ return Cost;
+ }))
+ return *PromotedCost;
auto LT = getTypeLegalizationCost(ValTy);
// Model unknown fp compares as a libcall.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 7f45177..fa9b25a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -435,6 +435,14 @@ public:
bool preferPredicatedReductionSelect() const override { return ST->hasSVE(); }
+ /// FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext)) if the
+ /// architecture features are not present.
+ std::optional<InstructionCost>
+ getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueInfo Op1Info,
+ TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
+ std::function<InstructionCost(Type *)> InstCost) const;
+
InstructionCost
getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
std::optional<FastMathFlags> FMF,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 010d0aaa..2155ace 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -125,7 +125,7 @@ struct AArch64OutgoingValueAssigner
bool UseVarArgsCCForFixed = IsCalleeWin && State.isVarArg();
bool Res;
- if (Info.IsFixed && !UseVarArgsCCForFixed) {
+ if (!Flags.isVarArg() && !UseVarArgsCCForFixed) {
if (!IsReturn)
applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT);
Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
@@ -361,7 +361,7 @@ struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler {
unsigned MaxSize = MemTy.getSizeInBytes() * 8;
// For varargs, we always want to extend them to 8 bytes, in which case
// we disable setting a max.
- if (!Arg.IsFixed)
+ if (Arg.Flags[0].isVarArg())
MaxSize = 0;
Register ValVReg = Arg.Regs[RegIndex];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 2a324e5..626734a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -997,89 +997,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const Function &F = MF.getFunction();
// Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
- // dispatch registers are function args.
- unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
-
- if (isShader(F.getCallingConv())) {
- bool IsPixelShader =
- F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
-
- // Calculate the number of VGPR registers based on the SPI input registers
- uint32_t InputEna = 0;
- uint32_t InputAddr = 0;
- unsigned LastEna = 0;
-
- if (IsPixelShader) {
- // Note for IsPixelShader:
- // By this stage, all enabled inputs are tagged in InputAddr as well.
- // We will use InputAddr to determine whether the input counts against the
- // vgpr total and only use the InputEnable to determine the last input
- // that is relevant - if extra arguments are used, then we have to honour
- // the InputAddr for any intermediate non-enabled inputs.
- InputEna = MFI->getPSInputEnable();
- InputAddr = MFI->getPSInputAddr();
-
- // We only need to consider input args up to the last used arg.
- assert((InputEna || InputAddr) &&
- "PSInputAddr and PSInputEnable should "
- "never both be 0 for AMDGPU_PS shaders");
- // There are some rare circumstances where InputAddr is non-zero and
- // InputEna can be set to 0. In this case we default to setting LastEna
- // to 1.
- LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
- }
+ // dispatch registers are passed as function args.
+ unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
+ WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
- // FIXME: We should be using the number of registers determined during
- // calling convention lowering to legalize the types.
- const DataLayout &DL = F.getDataLayout();
- unsigned PSArgCount = 0;
- unsigned IntermediateVGPR = 0;
- for (auto &Arg : F.args()) {
- unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
- if (Arg.hasAttribute(Attribute::InReg)) {
- WaveDispatchNumSGPR += NumRegs;
- } else {
- // If this is a PS shader and we're processing the PS Input args (first
- // 16 VGPR), use the InputEna and InputAddr bits to define how many
- // VGPRs are actually used.
- // Any extra VGPR arguments are handled as normal arguments (and
- // contribute to the VGPR count whether they're used or not).
- if (IsPixelShader && PSArgCount < 16) {
- if ((1 << PSArgCount) & InputAddr) {
- if (PSArgCount < LastEna)
- WaveDispatchNumVGPR += NumRegs;
- else
- IntermediateVGPR += NumRegs;
- }
- PSArgCount++;
- } else {
- // If there are extra arguments we have to include the allocation for
- // the non-used (but enabled with InputAddr) input arguments
- if (IntermediateVGPR) {
- WaveDispatchNumVGPR += IntermediateVGPR;
- IntermediateVGPR = 0;
- }
- WaveDispatchNumVGPR += NumRegs;
- }
- }
- }
+ if (WaveDispatchNumSGPR) {
ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
- {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
+ {ProgInfo.NumSGPR,
+ MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
+ Ctx)},
+ Ctx);
+ }
+ if (WaveDispatchNumVGPR) {
ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
{ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
- } else if (isKernel(F.getCallingConv()) &&
- MFI->getNumKernargPreloadedSGPRs()) {
- // Consider cases where the total number of UserSGPRs with trailing
- // allocated preload SGPRs, is greater than the number of explicitly
- // referenced SGPRs.
- const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
- CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
- ProgInfo.NumSGPR =
- AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
}
// Adjust number of registers used to meet default/requested minimum/maximum
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 3d8d274..64a9bde 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -580,6 +580,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
++i;
}
+ if (Info->getNumKernargPreloadedSGPRs())
+ Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
+
TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
return true;
@@ -743,6 +746,15 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (!determineAssignments(Assigner, SplitArgs, CCInfo))
return false;
+ if (IsEntryFunc) {
+ // This assumes the registers are allocated by CCInfo in ascending order
+ // with no gaps.
+ Info->setNumWaveDispatchSGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
+ Info->setNumWaveDispatchVGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
+ }
+
FormalArgHandler Handler(B, MRI);
if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index fb83388..9d6584a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3212,6 +3212,44 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
Src = Src.getOperand(0);
}
+ if (Mods != SISrcMods::NONE)
+ return true;
+
+ // Convert various sign-bit masks on integers to src mods. Currently disabled
+ // for 16-bit types as the codegen replaces the operand without adding a
+ // srcmod. This intentionally targets the cases where we are performing
+ // float neg and abs on int types; the goal is not to obtain two's complement
+ // neg or abs. Limit the conversion to select operands via the
+ // non-canonicalizing pattern.
+ // TODO: Add 16-bit support.
+ if (IsCanonicalizing)
+ return true;
+
+ unsigned Opc = Src->getOpcode();
+ EVT VT = Src.getValueType();
+ if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
+ (VT != MVT::i32 && VT != MVT::i64))
+ return true;
+
+ ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Src->getOperand(1));
+ if (!CRHS)
+ return true;
+
+ // Recognise (xor a, 0x80000000) as NEG SrcMod.
+ // Recognise (and a, 0x7fffffff) as ABS SrcMod.
+ // Recognise (or a, 0x80000000) as NEG+ABS SrcModifiers.
+ if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
+ Mods |= SISrcMods::NEG;
+ Src = Src.getOperand(0);
+ } else if (Opc == ISD::AND && AllowAbs &&
+ CRHS->getAPIntValue().isMaxSignedValue()) {
+ Mods |= SISrcMods::ABS;
+ Src = Src.getOperand(0);
+ } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
+ Mods |= SISrcMods::ABS | SISrcMods::NEG;
+ Src = Src.getOperand(0);
+ }
+
return true;
}
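The bit-level identities being matched here are the standard IEEE-754 sign tricks; a quick check in plain C++ (sample value only):

#include <bit>
#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
  float X = -3.5f;
  uint32_t Bits = std::bit_cast<uint32_t>(X);
  assert(std::bit_cast<float>(Bits ^ 0x80000000u) == -X);            // NEG
  assert(std::bit_cast<float>(Bits & 0x7FFFFFFFu) == std::fabs(X));  // ABS
  assert(std::bit_cast<float>(Bits | 0x80000000u) == -std::fabs(X)); // NEG+ABS
}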
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index a6e4a63..40d960e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5879,8 +5879,12 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
const LLT S32 = LLT::scalar(32);
MachineRegisterInfo &MRI = *B.getMRI();
- std::tie(BaseReg, ImmOffset) =
- AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
+ // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
+ // being added, so we can only safely match a 32-bit addition with no unsigned
+ // overflow.
+ bool CheckNUW = AMDGPU::isGFX1250(ST);
+ std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
+ MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
// If BaseReg is a pointer, convert it to int.
if (MRI.getType(BaseReg).isPointer())
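A small sketch of why the no-unsigned-wrap check matters (illustrative addresses; the hardware behavior is the zero-extend-then-add described in the comment above): folding an immediate out of a wrapping 32-bit add changes the address the hardware computes.

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Base = 0xFFFFFFF0u, Imm = 0x20u;
  uint32_t Wrapped = Base + Imm;                   // 0x10: the add wraps in 32 bits
  uint64_t HwSum = uint64_t(Base) + uint64_t(Imm); // 0x100000010: zext, then add
  std::printf("%#x vs %#llx\n", Wrapped, (unsigned long long)HwSum);
}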
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index f580f43..c21a9a1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -109,12 +109,17 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
// Find AV_* registers assigned to AGPRs.
const TargetRegisterClass *VirtRegRC = MRI.getRegClass(VReg);
- if (!TRI.isVectorSuperClass(VirtRegRC))
+ if (!TRI.hasAGPRs(VirtRegRC))
continue;
- const TargetRegisterClass *AssignedRC = TRI.getPhysRegBaseClass(PhysReg);
- if (!TRI.isAGPRClass(AssignedRC))
- continue;
+ const TargetRegisterClass *AssignedRC = VirtRegRC;
+ if (TRI.hasVGPRs(VirtRegRC)) {
+ // If this is an AV register, we have to check if the actual assignment is
+ // to an AGPR.
+ AssignedRC = TRI.getPhysRegBaseClass(PhysReg);
+ if (!TRI.isAGPRClass(AssignedRC))
+ continue;
+ }
LiveInterval &LI = LIS.getInterval(VReg);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index ff8efd2..0d2feeb 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -4933,6 +4933,43 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
return false;
}
+ // Packed math FP32 instructions typically accept SGPRs or VGPRs as source
+ // operands. On gfx12+, if a source operand uses SGPRs, the HW can only read
+ // the first SGPR and use it for both the low and high operations.
+ if (isPackedFP32Inst(Opc) && isGFX12Plus()) {
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi);
+
+ const MCOperand &Src0 = Inst.getOperand(Src0Idx);
+ const MCOperand &Src1 = Inst.getOperand(Src1Idx);
+ unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+ unsigned OpSelHi = Inst.getOperand(OpSelHiIdx).getImm();
+
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+
+ auto VerifyOneSGPR = [OpSel, OpSelHi](unsigned Index) -> bool {
+ unsigned Mask = 1U << Index;
+ return ((OpSel & Mask) == 0) && ((OpSelHi & Mask) == 0);
+ };
+
+ if (Src0.isReg() && isSGPR(Src0.getReg(), TRI) &&
+ !VerifyOneSGPR(/*Index=*/0))
+ return false;
+ if (Src1.isReg() && isSGPR(Src1.getReg(), TRI) &&
+ !VerifyOneSGPR(/*Index=*/1))
+ return false;
+
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (Src2Idx != -1) {
+ const MCOperand &Src2 = Inst.getOperand(Src2Idx);
+ if (Src2.isReg() && isSGPR(Src2.getReg(), TRI) &&
+ !VerifyOneSGPR(/*Index=*/2))
+ return false;
+ }
+ }
+
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 334afd3..ef63acc 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -368,46 +368,45 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
////////////////////////////////////////////////////////////////////////////////
// GCNRPTarget
-GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP,
- bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
+GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP)
+ : GCNRPTarget(RP, MF) {
const Function &F = MF.getFunction();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- setRegLimits(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F), MF);
+ setTarget(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F));
}
GCNRPTarget::GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs,
- const MachineFunction &MF, const GCNRegPressure &RP,
- bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
- setRegLimits(NumSGPRs, NumVGPRs, MF);
+ const MachineFunction &MF, const GCNRegPressure &RP)
+ : GCNRPTarget(RP, MF) {
+ setTarget(NumSGPRs, NumVGPRs);
}
GCNRPTarget::GCNRPTarget(unsigned Occupancy, const MachineFunction &MF,
- const GCNRegPressure &RP, bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
+ const GCNRegPressure &RP)
+ : GCNRPTarget(RP, MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
- setRegLimits(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false),
- ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize), MF);
+ setTarget(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false),
+ ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize));
}
-void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs,
- const MachineFunction &MF) {
+void GCNRPTarget::setTarget(unsigned NumSGPRs, unsigned NumVGPRs) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- unsigned DynamicVGPRBlockSize =
- MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs);
MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs);
- MaxUnifiedVGPRs =
- ST.hasGFX90AInsts()
- ? std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs)
- : 0;
+ if (UnifiedRF) {
+ unsigned DynamicVGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ MaxUnifiedVGPRs =
+ std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs);
+ } else {
+ MaxUnifiedVGPRs = 0;
+ }
}
-bool GCNRPTarget::isSaveBeneficial(Register Reg,
- const MachineRegisterInfo &MRI) const {
+bool GCNRPTarget::isSaveBeneficial(Register Reg) const {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
@@ -416,16 +415,19 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg,
return RP.getSGPRNum() > MaxSGPRs;
unsigned NumVGPRs =
SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum();
- return isVGPRBankSaveBeneficial(NumVGPRs);
+ // The addressable limit must always be respected.
+ if (NumVGPRs > MaxVGPRs)
+ return true;
+ // For unified RFs, the combined VGPR usage limit must be respected as well.
+ return UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs;
}
bool GCNRPTarget::satisfied() const {
- if (RP.getSGPRNum() > MaxSGPRs)
+ if (RP.getSGPRNum() > MaxSGPRs || RP.getVGPRNum(false) > MaxVGPRs)
return false;
- if (RP.getVGPRNum(false) > MaxVGPRs &&
- (!CombineVGPRSavings || !satisifiesVGPRBanksTarget()))
+ if (UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs)
return false;
- return satisfiesUnifiedTarget();
+ return true;
}
///////////////////////////////////////////////////////////////////////////////
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index ea33a22..a9c58bb 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -186,20 +186,22 @@ public:
/// Sets up the target such that the register pressure starting at \p RP does
/// not show register spilling on function \p MF (w.r.t. the function's
 /// minimum target occupancy).
- GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP,
- bool CombineVGPRSavings = false);
+ GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP);
/// Sets up the target such that the register pressure starting at \p RP does
/// not use more than \p NumSGPRs SGPRs and \p NumVGPRs VGPRs on function \p
/// MF.
GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, const MachineFunction &MF,
- const GCNRegPressure &RP, bool CombineVGPRSavings = false);
+ const GCNRegPressure &RP);
/// Sets up the target such that the register pressure starting at \p RP does
/// not prevent achieving an occupancy of at least \p Occupancy on function
/// \p MF.
GCNRPTarget(unsigned Occupancy, const MachineFunction &MF,
- const GCNRegPressure &RP, bool CombineVGPRSavings = false);
+ const GCNRegPressure &RP);
+
+ /// Changes the target (same semantics as constructor).
+ void setTarget(unsigned NumSGPRs, unsigned NumVGPRs);
const GCNRegPressure &getCurrentRP() const { return RP; }
@@ -207,7 +209,7 @@ public:
/// Determines whether saving virtual register \p Reg will be beneficial
/// towards achieving the RP target.
- bool isSaveBeneficial(Register Reg, const MachineRegisterInfo &MRI) const;
+ bool isSaveBeneficial(Register Reg) const;
/// Saves virtual register \p Reg with lanemask \p Mask.
void saveReg(Register Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI) {
@@ -227,15 +229,15 @@ public:
if (Target.MaxUnifiedVGPRs) {
OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs
<< " VGPRs (unified)";
- } else if (Target.CombineVGPRSavings) {
- OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/'
- << 2 * Target.MaxVGPRs << " VGPRs (combined target)";
}
return OS;
}
#endif
private:
+ const MachineFunction &MF;
+ const bool UnifiedRF;
+
/// Current register pressure.
GCNRegPressure RP;
@@ -246,29 +248,10 @@ private:
/// Target number of overall VGPRs for subtargets with unified RFs. Always 0
/// for subtargets with non-unified RFs.
unsigned MaxUnifiedVGPRs;
- /// Whether we consider that the register allocator will be able to swap
- /// between ArchVGPRs and AGPRs by copying them to a super register class.
- /// Concretely, this allows savings in one of the VGPR banks to help toward
- /// savings in the other VGPR bank.
- bool CombineVGPRSavings;
-
- inline bool satisifiesVGPRBanksTarget() const {
- assert(CombineVGPRSavings && "only makes sense with combined savings");
- return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs;
- }
-
- /// Always satisified when the subtarget doesn't have a unified RF.
- inline bool satisfiesUnifiedTarget() const {
- return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs;
- }
-
- inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const {
- return NumVGPRs > MaxVGPRs || !satisfiesUnifiedTarget() ||
- (CombineVGPRSavings && !satisifiesVGPRBanksTarget());
- }
- void setRegLimits(unsigned MaxSGPRs, unsigned MaxVGPRs,
- const MachineFunction &MF);
+ GCNRPTarget(const GCNRegPressure &RP, const MachineFunction &MF)
+ : MF(MF), UnifiedRF(MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()),
+ RP(RP) {}
};
///////////////////////////////////////////////////////////////////////////////
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 96d5668..254b75b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1086,7 +1086,8 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
}
 /// Allows easy filtering of this stage's debug output.
-#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
+#define REMAT_PREFIX "[PreRARemat] "
+#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
bool PreRARematStage::initGCNSchedStage() {
// FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
@@ -1115,10 +1116,15 @@ bool PreRARematStage::initGCNSchedStage() {
rematerialize();
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
- REMAT_DEBUG(
- dbgs() << "Retrying function scheduling with new min. occupancy of "
- << AchievedOcc << " from rematerializing (original was "
- << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n");
+ REMAT_DEBUG({
+ dbgs() << "Retrying function scheduling with new min. occupancy of "
+ << AchievedOcc << " from rematerializing (original was "
+ << DAG.MinOccupancy;
+ if (TargetOcc)
+ dbgs() << ", target was " << *TargetOcc;
+ dbgs() << ")\n";
+ });
+
if (AchievedOcc > DAG.MinOccupancy) {
DAG.MinOccupancy = AchievedOcc;
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
@@ -1540,8 +1546,7 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
- mayCauseSpilling(WavesAfter) ||
- (IncreaseOccupancy && WavesAfter < TargetOcc);
+ mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc);
}
bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
@@ -1687,78 +1692,63 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
}
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
- REMAT_DEBUG({
- dbgs() << "Collecting rematerializable instructions in ";
- MF.getFunction().printAsOperand(dbgs(), false);
- dbgs() << '\n';
- });
+ const Function &F = MF.getFunction();
// Maps optimizable regions (i.e., regions at minimum and register-limited
// occupancy, or regions with spilling) to the target RP we would like to
// reach.
DenseMap<unsigned, GCNRPTarget> OptRegions;
- const Function &F = MF.getFunction();
- unsigned DynamicVGPRBlockSize =
- MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
-
- std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
- const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
- const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
- const unsigned MaxSGPRsIncOcc =
- ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
- const unsigned MaxVGPRsIncOcc =
- ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize);
- IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
-
- // Collect optimizable regions. If there is spilling in any region we will
- // just try to reduce spilling. Otherwise we will try to increase occupancy by
- // one in the whole function.
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- GCNRegPressure &RP = DAG.Pressure[I];
- // We allow ArchVGPR or AGPR savings to count as savings of the other kind
- // of VGPR only when trying to eliminate spilling. We cannot do this when
- // trying to increase occupancy since VGPR class swaps only occur later in
- // the register allocator i.e., the scheduler will not be able to reason
- // about these savings and will not report an increase in the achievable
- // occupancy, triggering rollbacks.
- GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP,
- /*CombineVGPRSavings=*/true);
- if (!Target.satisfied() && IncreaseOccupancy) {
- // There is spilling in the region and we were so far trying to increase
- // occupancy. Strop trying that and focus on reducing spilling.
- IncreaseOccupancy = false;
- OptRegions.clear();
- } else if (IncreaseOccupancy) {
- // There is no spilling in the region, try to increase occupancy.
- Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP,
- /*CombineVGPRSavings=*/false);
+ unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
+ auto ResetTargetRegions = [&]() {
+ OptRegions.clear();
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ const GCNRegPressure &RP = DAG.Pressure[I];
+ GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP);
+ if (!Target.satisfied())
+ OptRegions.insert({I, Target});
}
- if (!Target.satisfied())
- OptRegions.insert({I, Target});
- }
- if (OptRegions.empty())
- return false;
+ };
-#ifndef NDEBUG
- if (IncreaseOccupancy) {
- REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy
- << ") in regions:\n");
+ ResetTargetRegions();
+ if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
+ // In addition to register usage being above addressable limits, occupancy
+ // below the minimum is treated as "spilling" as well.
+ TargetOcc = std::nullopt;
} else {
- REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy ("
- << WavesPerEU.first << ") in regions:\n");
- }
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
- REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n');
+ // There is no spilling and room to improve occupancy; set up "increased
+ // occupancy targets" for all regions.
+ TargetOcc = DAG.MinOccupancy + 1;
+ unsigned VGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
+ MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
+ ResetTargetRegions();
}
-#endif
-
- // When we are reducing spilling, the target is the minimum target number of
- // waves/EU determined by the subtarget. In cases where either one of
- // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current
- // minimum region occupancy may be higher than the latter.
- TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1
- : std::max(DAG.MinOccupancy, WavesPerEU.first);
+ REMAT_DEBUG({
+ dbgs() << "Analyzing ";
+ MF.getFunction().printAsOperand(dbgs(), false);
+ dbgs() << ": ";
+ if (OptRegions.empty()) {
+ dbgs() << "no objective to achieve, occupancy is maximal at "
+ << MFI.getMaxWavesPerEU();
+ } else if (!TargetOcc) {
+ dbgs() << "reduce spilling (minimum target occupancy is "
+ << MFI.getMinWavesPerEU() << ')';
+ } else {
+ dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to "
+ << TargetOcc;
+ }
+ dbgs() << '\n';
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) {
+ dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond()
+ << '\n';
+ }
+ }
+ });
+ if (OptRegions.empty())
+ return false;
// Accounts for a reduction in RP in an optimizable region. Returns whether we
// estimate that we have identified enough rematerialization opportunities to
@@ -1767,7 +1757,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
bool &Progress) -> bool {
GCNRPTarget &Target = OptIt->getSecond();
- if (!Target.isSaveBeneficial(Reg, DAG.MRI))
+ if (!Target.isSaveBeneficial(Reg))
return false;
Progress = true;
Target.saveReg(Reg, Mask, DAG.MRI);
@@ -1876,7 +1866,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
}
}
- if (IncreaseOccupancy) {
+ if (TargetOcc) {
// We were trying to increase occupancy but failed, abort the stage.
REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
Rematerializations.clear();
@@ -1979,7 +1969,9 @@ void PreRARematStage::rematerialize() {
// All regions impacted by at least one rematerialization must be rescheduled.
// Maximum pressure must also be recomputed for all regions where it changed
// non-predictably and checked against the target occupancy.
- AchievedOcc = TargetOcc;
+ unsigned DynamicVGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ AchievedOcc = MFI.getMaxWavesPerEU();
for (auto &[I, OriginalRP] : ImpactedRegions) {
bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
RescheduleRegions[I] = !IsEmptyRegion;
@@ -2003,9 +1995,8 @@ void PreRARematStage::rematerialize() {
}
}
DAG.Pressure[I] = RP;
- AchievedOcc = std::min(
- AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
- ->getDynamicVGPRBlockSize()));
+ AchievedOcc =
+ std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
}
REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
}
@@ -2035,7 +2026,7 @@ void PreRARematStage::finalizeGCNSchedStage() {
// which case we do not want to rollback either (the rescheduling was already
// reverted in PreRARematStage::shouldRevertScheduling in such cases).
unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
- if (!IncreaseOccupancy || MaxOcc >= TargetOcc)
+ if (!TargetOcc || MaxOcc >= *TargetOcc)
return;
REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 32139a9..790370f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -470,15 +470,12 @@ private:
/// After successful stage initialization, indicates which regions should be
/// rescheduled.
BitVector RescheduleRegions;
- /// Target occupancy the stage estimates is reachable through
- /// rematerialization. Greater than or equal to the pre-stage min occupancy.
- unsigned TargetOcc;
+ /// The target occupancy the stage is trying to achieve. Empty when the
+ /// objective is spilling reduction.
+ std::optional<unsigned> TargetOcc;
/// Achieved occupancy *only* through rematerializations (pre-rescheduling).
/// Smaller than or equal to the target occupancy.
unsigned AchievedOcc;
- /// Whether the stage is attempting to increase occupancy in the abscence of
- /// spilling.
- bool IncreaseOccupancy;
/// Returns whether remat can reduce spilling or increase function occupancy
/// by 1 through rematerialization. If it can do one, collects instructions in
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 0c653b1..962c276 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -2081,7 +2081,9 @@ SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
case AMDGPU::V_MAX_F16_fake16_e64:
case AMDGPU::V_MAX_F64_e64:
case AMDGPU::V_MAX_NUM_F64_e64:
- case AMDGPU::V_PK_MAX_F16: {
+ case AMDGPU::V_PK_MAX_F16:
+ case AMDGPU::V_MAX_BF16_PSEUDO_e64:
+ case AMDGPU::V_PK_MAX_NUM_BF16: {
if (MI.mayRaiseFPException())
return nullptr;
@@ -2108,8 +2110,10 @@ SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
// Having a 0 op_sel_hi would require swizzling the output in the source
// instruction, which we can't do.
- unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
- : 0u;
+ unsigned UnsetMods =
+ (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
+ ? SISrcMods::OP_SEL_1
+ : 0u;
if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
return nullptr;
return Src0;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8f44c03..1b7d65a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3106,6 +3106,15 @@ SDValue SITargetLowering::LowerFormalArguments(
if (!IsKernel) {
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
+
+ // This assumes the registers are allocated by CCInfo in ascending order
+ // with no gaps.
+ Info->setNumWaveDispatchSGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
+ Info->setNumWaveDispatchVGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
+ } else if (Info->getNumKernargPreloadedSGPRs()) {
+ Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
}
SmallVector<SDValue, 16> Chains;
@@ -6106,6 +6115,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
case MVT::f64:
return true;
case MVT::f16:
+ case MVT::bf16:
return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
default:
break;
@@ -10877,6 +10887,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
}
+// Return whether the operation has NoUnsignedWrap property.
+static bool isNoUnsignedWrap(SDValue Addr) {
+ return (Addr.getOpcode() == ISD::ADD &&
+ Addr->getFlags().hasNoUnsignedWrap()) ||
+ Addr->getOpcode() == ISD::OR;
+}
+
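For the OR case: SelectionDAG commonly builds addresses as an OR of a base and an offset with no common set bits, and such an OR behaves exactly like an add that cannot carry, hence cannot wrap. A minimal check (assumes disjoint operands, which is what this helper implicitly relies on):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Base = 0xFFFF0000u, Off = 0x1234u;
  assert((Base & Off) == 0);          // disjoint bits
  assert((Base | Off) == Base + Off); // OR == ADD, no carry possible
}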
bool SITargetLowering::shouldPreservePtrArith(const Function &F,
EVT PtrVT) const {
return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
@@ -10898,8 +10915,14 @@ SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
if ((C1 = dyn_cast<ConstantSDNode>(N0)))
N0 = SDValue();
else if (DAG.isBaseWithConstantOffset(N0)) {
- C1 = cast<ConstantSDNode>(N0.getOperand(1));
- N0 = N0.getOperand(0);
+ // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
+ // being added, so we can only safely match a 32-bit addition with no
+ // unsigned overflow.
+ bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
+ if (!CheckNUW || isNoUnsignedWrap(N0)) {
+ C1 = cast<ConstantSDNode>(N0.getOperand(1));
+ N0 = N0.getOperand(0);
+ }
}
if (C1) {
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index d8fe850..0a68512 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -51,7 +51,7 @@ static cl::opt<unsigned>
namespace {
enum HardClauseType {
- // For GFX10:
+ // For GFX10 and GFX1250:
// Texture, buffer, global or scratch memory instructions.
HARDCLAUSE_VMEM,
@@ -102,7 +102,8 @@ public:
HardClauseType getHardClauseType(const MachineInstr &MI) {
if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) {
- if (ST->getGeneration() == AMDGPUSubtarget::GFX10) {
+ if (ST->getGeneration() == AMDGPUSubtarget::GFX10 ||
+ ST->hasGFX1250Insts()) {
if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
SIInstrInfo::isSegmentSpecificFLAT(MI)) {
if (ST->hasNSAClauseBug()) {
@@ -115,7 +116,6 @@ public:
if (SIInstrInfo::isFLAT(MI))
return HARDCLAUSE_FLAT;
} else {
- assert(ST->getGeneration() >= AMDGPUSubtarget::GFX11);
if (SIInstrInfo::isMIMG(MI)) {
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f20b22d..19e6bcf 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -18,6 +18,7 @@
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -5534,6 +5535,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
+ // information.
+ if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
+ for (unsigned I = 0; I < 3; ++I) {
+ if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
+ return false;
+ }
+ }
+
return true;
}
@@ -6005,6 +6015,21 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
unsigned Opc = MI.getOpcode();
+ // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
+ // information.
+ if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
+ MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
+ constexpr const AMDGPU::OpName OpNames[] = {
+ AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
+
+ for (auto [I, OpName] : enumerate(OpNames)) {
+ int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
+ if (static_cast<unsigned>(SrcIdx) == OpIdx &&
+ !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
+ return false;
+ }
+ }
+
if (!isLegalRegOperand(MRI, OpInfo, MO))
return false;
@@ -6053,6 +6078,39 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
return true;
}
+bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
+ const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
+ const MachineOperand *MO) const {
+ constexpr const unsigned NumOps = 3;
+ constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
+ AMDGPU::OpName::src0, AMDGPU::OpName::src1,
+ AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
+
+ assert(SrcN < NumOps);
+
+ if (!MO) {
+ int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
+ if (SrcIdx == -1)
+ return true;
+ MO = &MI.getOperand(SrcIdx);
+ }
+
+ if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
+ return true;
+
+ int ModsIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
+ if (ModsIdx == -1)
+ return true;
+
+ unsigned Mods = MI.getOperand(ModsIdx).getImm();
+ bool OpSel = Mods & SISrcMods::OP_SEL_0;
+ bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
+
+ return !OpSel && !OpSelHi;
+}
+
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO) const {
const MachineFunction &MF = *MI.getParent()->getParent();
@@ -6390,6 +6448,15 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
!RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
legalizeOpWithMove(MI, VOP3Idx[2]);
+
+ // Fix the register class of packed FP32 instructions on gfx12+. See
+ // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
+ if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
+ for (unsigned I = 0; I < 3; ++I) {
+ if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
+ legalizeOpWithMove(MI, VOP3Idx[I]);
+ }
+ }
}
Register SIInstrInfo::readlaneVGPRToSGPR(
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e042b59..6b9403f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1287,6 +1287,19 @@ public:
const MachineOperand &MO) const;
bool isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand &MO) const;
+
+ /// Check if \p MO would be a legal operand for gfx12+ packed math FP32
+ /// instructions. Packed math FP32 instructions typically accept SGPRs or
+ /// VGPRs as source operands. On gfx12+, if a source operand uses SGPRs, the
+ /// HW can only read the first SGPR and use it for both the low and high
+ /// operations.
+ /// \p SrcN can be 0, 1, or 2, representing src0, src1, and src2,
+ /// respectively. If \p MO is nullptr, the operand corresponding to SrcN will
+ /// be used.
+ bool isLegalGFX12PlusPackedMathFP32Operand(
+ const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
+ const MachineOperand *MO = nullptr) const;
+
/// Legalize operands in \p MI by either commuting it or inserting a
/// copy of src1.
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 9a1448f..49425d5 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -728,6 +728,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
+ NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
+ NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
Occupancy(MFI.getOccupancy()),
ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
@@ -784,6 +786,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
WaveLimiter = YamlMFI.WaveLimiter;
HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
+ NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
+ NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
ReturnsVoid = YamlMFI.ReturnsVoid;
IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 08b0206..ca8f803 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -270,6 +270,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
bool WaveLimiter = false;
bool HasSpilledSGPRs = false;
bool HasSpilledVGPRs = false;
+ uint16_t NumWaveDispatchSGPRs = 0;
+ uint16_t NumWaveDispatchVGPRs = 0;
uint32_t HighBitsOf32BitAddress = 0;
// TODO: 10 may be a better default since it's the maximum.
@@ -327,6 +329,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false);
YamlIO.mapOptional("hasSpilledVGPRs", MFI.HasSpilledVGPRs, false);
+ YamlIO.mapOptional("numWaveDispatchSGPRs", MFI.NumWaveDispatchSGPRs, false);
+ YamlIO.mapOptional("numWaveDispatchVGPRs", MFI.NumWaveDispatchVGPRs, false);
YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg,
StringValue("$private_rsrc_reg"));
YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg,
@@ -465,6 +469,9 @@ private:
unsigned NumUserSGPRs = 0;
unsigned NumSystemSGPRs = 0;
+ unsigned NumWaveDispatchSGPRs = 0;
+ unsigned NumWaveDispatchVGPRs = 0;
+
bool HasSpilledSGPRs = false;
bool HasSpilledVGPRs = false;
bool HasNonSpillStackObjects = false;
@@ -991,6 +998,14 @@ public:
return UserSGPRInfo.getNumKernargPreloadSGPRs();
}
+ unsigned getNumWaveDispatchSGPRs() const { return NumWaveDispatchSGPRs; }
+
+ void setNumWaveDispatchSGPRs(unsigned Count) { NumWaveDispatchSGPRs = Count; }
+
+ unsigned getNumWaveDispatchVGPRs() const { return NumWaveDispatchVGPRs; }
+
+ void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; }
+
Register getPrivateSegmentWaveByteOffsetSystemSGPR() const {
return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 00dcb9b..1e3e9a2 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -3318,6 +3318,20 @@ unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
return 128;
}
+bool isPackedFP32Inst(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_ADD_F32_gfx12:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_F32_gfx12:
+ case AMDGPU::V_PK_FMA_F32:
+ case AMDGPU::V_PK_FMA_F32_gfx12:
+ return true;
+ default:
+ return false;
+ }
+}
+
} // namespace AMDGPU
raw_ostream &operator<<(raw_ostream &OS,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 1252e35..1bcd36c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1709,6 +1709,8 @@ bool isArgPassedInSGPR(const Argument *Arg);
bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo);
+LLVM_READONLY bool isPackedFP32Inst(unsigned Opc);
+
LLVM_READONLY
bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
int64_t EncodedOffset);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index ea99cc4..75d3cfa 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -802,6 +802,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::BSWAP, VT, Expand);
}
+ if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
+ setOperationAction(ISD::SCMP, MVT::i32, Custom);
+
+ if (!Subtarget->hasV8_1MMainlineOps())
+ setOperationAction(ISD::UCMP, MVT::i32, Custom);
+
setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
@@ -1634,6 +1640,10 @@ bool ARMTargetLowering::useSoftFloat() const {
return Subtarget->useSoftFloat();
}
+bool ARMTargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
+ return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
+}
+
// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
@@ -10612,6 +10622,133 @@ SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
return DAG.getBitcast(MVT::i32, Res);
}
+SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+
+ // Determine whether this is a signed or an unsigned comparison.
+ bool IsSigned = (Op.getOpcode() == ISD::SCMP);
+
+ // Special case for Thumb1 UCMP only
+ if (!IsSigned && Subtarget->isThumb1Only()) {
+ // For Thumb unsigned comparison, use this sequence:
+ // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
+ // sbc r2, r2 ; r2 = r2 - r2 - !carry
+ // cmp r1, r0 ; compare RHS with LHS
+ // sbc r1, r1 ; r1 = r1 - r1 - !carry
+ // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
+
+ // First subtraction: LHS - RHS
+ SDValue Sub1WithFlags = DAG.getNode(
+ ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
+ SDValue Sub1Result = Sub1WithFlags.getValue(0);
+ SDValue Flags1 = Sub1WithFlags.getValue(1);
+
+ // SUBE: Sub1Result - Sub1Result - !carry
+ // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
+ SDValue Sbc1 =
+ DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
+ Sub1Result, Sub1Result, Flags1);
+ SDValue Sbc1Result = Sbc1.getValue(0);
+
+ // Second comparison: RHS vs LHS (reverse comparison)
+ SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
+
+ // SUBE: RHS - RHS - !carry
+ // This gives 0 if RHS >= LHS (unsigned), -1 if RHS < LHS (unsigned)
+ SDValue Sbc2 = DAG.getNode(
+ ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
+ SDValue Sbc2Result = Sbc2.getValue(0);
+
+ // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
+ SDValue Result =
+ DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
+ if (Op.getValueType() != MVT::i32)
+ Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
+
+ return Result;
+ }
+
+ // For the ARM assembly pattern:
+ //   subs r0, r0, r1 ; subtract RHS from LHS and set flags
+ //   movgt r0, #1    ; if LHS > RHS, set result to 1 (GT for signed, HI for unsigned)
+ //   mvnlt r0, #0    ; if LHS < RHS, set result to -1 (LT for signed, LO for unsigned)
+ //                   ; if LHS == RHS, result remains 0 from the subs
+
+ // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
+ unsigned Opcode = ARMISD::SUBC;
+
+ // Check if RHS is a subtraction against 0: (0 - X)
+ if (RHS.getOpcode() == ISD::SUB) {
+ SDValue SubLHS = RHS.getOperand(0);
+ SDValue SubRHS = RHS.getOperand(1);
+
+ // Check if it's 0 - X
+ if (isNullConstant(SubLHS)) {
+ bool CanUseAdd = false;
+ if (IsSigned) {
+ // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
+ if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
+ .getSignedMinValue()
+ .isMinSignedValue()) {
+ CanUseAdd = true;
+ }
+ } else {
+ // For UCMP: only if X is known to never be zero
+ if (DAG.isKnownNeverZero(SubRHS)) {
+ CanUseAdd = true;
+ }
+ }
+
+ if (CanUseAdd) {
+ Opcode = ARMISD::ADDC;
+ RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
+ // LHS - (0 - X)
+ }
+ }
+ }
+
+ // Generate the operation with flags
+ SDValue OpWithFlags;
+ if (Opcode == ARMISD::ADDC) {
+ // Use ADDC: LHS + RHS (where RHS was 0 - X, now X)
+ OpWithFlags = DAG.getNode(ARMISD::ADDC, dl,
+ DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
+ } else {
+ // Use ARMISD::SUBC to generate SUBS instruction (subtract with flags)
+ OpWithFlags = DAG.getNode(ARMISD::SUBC, dl,
+ DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
+ }
+
+ SDValue OpResult = OpWithFlags.getValue(0); // The operation result
+ SDValue Flags = OpWithFlags.getValue(1); // The flags
+
+ // Constants for conditional moves
+ SDValue One = DAG.getConstant(1, dl, MVT::i32);
+ SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
+
+ // Select condition codes based on signed vs unsigned
+ ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
+ ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
+
+ // First conditional move: if greater than, set to 1
+ SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
+ SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
+ GTCondValue, Flags);
+
+ // Second conditional move: if less than, set to -1
+ SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
+ SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
+ LTCondValue, Flags);
+
+ if (Op.getValueType() != MVT::i32)
+ Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
+
+ return Result2;
+}
+
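A minimal host-side model of what LowerCMP produces on both paths may help: the Thumb1 carry/sbc sequence and the subs+cmov sequence each compute (LHS > RHS) - (LHS < RHS), i.e. -1, 0, or 1. The helpers below are an illustrative sketch, not backend code; the names are invented for the example.

#include <cassert>
#include <cstdint>

// Model of the Thumb1 sequence: each sbc materializes an all-ones mask from
// the borrow of the preceding subtraction, and the final subs combines them.
static int32_t ucmpModel(uint32_t LHS, uint32_t RHS) {
  int32_t LtMask = LHS < RHS ? -1 : 0; // sbc after "subs LHS, RHS"
  int32_t GtMask = RHS < LHS ? -1 : 0; // sbc after "cmp RHS, LHS"
  return LtMask - GtMask;              // final subs: -1, 0, or 1
}

// Model of the subs+cmov path: subs leaves 0 exactly when LHS == RHS, and
// the two conditional moves overwrite it for the other outcomes.
static int32_t scmpModel(int32_t LHS, int32_t RHS) {
  int32_t Res = 0;
  if (LHS > RHS)
    Res = 1;  // movgt #1 (GT for signed, HI for unsigned)
  if (LHS < RHS)
    Res = -1; // mvnlt #0 (LT for signed, LO for unsigned)
  return Res;
}

int main() {
  assert(ucmpModel(1, 2) == -1 && ucmpModel(2, 2) == 0 && ucmpModel(3, 2) == 1);
  assert(scmpModel(-5, 7) == -1 && scmpModel(7, 7) == 0 && scmpModel(7, -5) == 1);
  return 0;
}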
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
switch (Op.getOpcode()) {
@@ -10740,6 +10877,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FP_TO_BF16:
return LowerFP_TO_BF16(Op, DAG);
case ARMISD::WIN__DBZCHK: return SDValue();
+ case ISD::UCMP:
+ case ISD::SCMP:
+ return LowerCMP(Op, DAG);
}
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 825145d..a84a3cb 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -607,6 +607,8 @@ class VectorType;
bool preferZeroCompareBranch() const override { return true; }
+ bool shouldExpandCmpUsingSelects(EVT VT) const override;
+
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
bool hasAndNotCompare(SDValue V) const override {
@@ -904,6 +906,7 @@ class VectorType;
void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const;
SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const;
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 3955f2a..25ad9ec 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -669,7 +669,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
default: {
// Turn lhs < rhs with lhs constant into rhs >= lhs+1; this allows
// us to fold the constant into the cmp instruction.
- RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
+ RHS = DAG.getSignedConstant(C->getSExtValue() + 1, DL, VT);
CC = ISD::SETGE;
break;
}
@@ -713,7 +713,10 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// Turn lhs < rhs with lhs constant into rhs >= lhs+1; this allows us to
// fold the constant into the cmp instruction.
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
- RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
+      // An "icmp ugt i16 65535, %0" comparison should already have been
+      // converted to something else. Assert to make sure this assumption
+      // holds.
+ assert((!C->isAllOnes()) && "integer overflow in comparison transform");
+ RHS = DAG.getConstant(C->getZExtValue() + 1, DL, VT);
CC = ISD::SETUGE;
break;
}
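A standalone check of the rewrite this hunk relies on, assuming unsigned 16-bit operands as in the "i16 65535" example above: x > C is equivalent to x >= C + 1 whenever C + 1 does not wrap, which is exactly the case the new assert rules out.

#include <cassert>
#include <cstdint>

int main() {
  // 0xFFFF is deliberately excluded: C + 1 would wrap, and the assert above
  // documents that such comparisons are expected to be folded away earlier.
  for (uint32_t C = 0; C <= 0xFFFE; ++C) {
    const uint32_t Probes[] = {0u, C, C + 1, 0xFFFFu};
    for (uint32_t X : Probes)
      assert((X > C) == (X >= C + 1));
  }
  return 0;
}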
diff --git a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp
index 73abfe7..306db6a 100644
--- a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp
+++ b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp
@@ -87,17 +87,50 @@ static bool forwardHandleAccesses(Function &F, DominatorTree &DT) {
for (LoadInst *LI : LoadsToProcess) {
Value *V = LI->getPointerOperand();
- auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
+ auto *GV = dyn_cast<GlobalVariable>(V);
// If we didn't find the global, we may need to walk through a level of
// indirection. This generally happens at -O0.
- if (!GV)
+ if (!GV) {
if (auto *NestedLI = dyn_cast<LoadInst>(V)) {
BasicBlock::iterator BBI(NestedLI);
Value *Loaded = FindAvailableLoadedValue(
NestedLI, NestedLI->getParent(), BBI, 0, nullptr, nullptr);
GV = dyn_cast_or_null<GlobalVariable>(Loaded);
+ } else if (auto *NestedAlloca = dyn_cast<AllocaInst>(V)) {
+ for (auto &Use : NestedAlloca->uses()) {
+ auto *Store = dyn_cast<StoreInst>(Use.getUser());
+ if (!Store)
+ continue;
+
+ Value *StoredVal = Store->getValueOperand();
+ if (!StoredVal)
+ continue;
+
+ // Try direct global match
+ GV = dyn_cast<GlobalVariable>(StoredVal);
+ if (GV)
+ break;
+
+ // If it's a load, check its source
+ if (auto *Load = dyn_cast<LoadInst>(StoredVal)) {
+ GV = dyn_cast<GlobalVariable>(Load->getPointerOperand());
+ if (GV)
+ break;
+
+          // If loading from an unmodified stack copy of the global, reuse
+          // the global's value. Note: this repeats the outer load handling
+          // for the alloca-store pattern.
+ BasicBlock::iterator BBI(Load);
+ Value *Loaded = FindAvailableLoadedValue(Load, Load->getParent(),
+ BBI, 0, nullptr, nullptr);
+ GV = dyn_cast<GlobalVariable>(Loaded);
+ if (GV)
+ break;
+ }
+ }
}
+ }
auto It = HandleMap.find(GV);
if (It == HandleMap.end()) {
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
index ffd900c..5153d24 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
@@ -56,6 +56,8 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable(
case Intrinsic::dx_wave_reduce_sum:
case Intrinsic::dx_wave_reduce_umax:
case Intrinsic::dx_wave_reduce_usum:
+ case Intrinsic::dx_imad:
+ case Intrinsic::dx_umad:
return true;
default:
return false;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index a5bf0e5..6583a0f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -6729,8 +6729,7 @@ static bool CC_LoongArchAssign2GRLen(unsigned GRLen, CCState &State,
static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI,
unsigned ValNo, MVT ValVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State, bool IsFixed, bool IsRet,
- Type *OrigTy) {
+ CCState &State, bool IsRet, Type *OrigTy) {
unsigned GRLen = DL.getLargestLegalIntTypeSizeInBits();
assert((GRLen == 32 || GRLen == 64) && "Unsupported GRLen");
MVT GRLenVT = GRLen == 32 ? MVT::i32 : MVT::i64;
@@ -6752,7 +6751,7 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI,
case LoongArchABI::ABI_LP64F:
case LoongArchABI::ABI_ILP32D:
case LoongArchABI::ABI_LP64D:
- UseGPRForFloat = !IsFixed;
+ UseGPRForFloat = ArgFlags.isVarArg();
break;
case LoongArchABI::ABI_ILP32S:
case LoongArchABI::ABI_LP64S:
@@ -6766,7 +6765,8 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI,
// will not be passed by registers if the original type is larger than
// 2*GRLen, so the register alignment rule does not apply.
unsigned TwoGRLenInBytes = (2 * GRLen) / 8;
- if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoGRLenInBytes &&
+ if (ArgFlags.isVarArg() &&
+ ArgFlags.getNonZeroOrigAlign() == TwoGRLenInBytes &&
DL.getTypeAllocSize(OrigTy) == TwoGRLenInBytes) {
unsigned RegIdx = State.getFirstUnallocated(ArgGPRs);
// Skip 'odd' register if necessary.
@@ -6916,7 +6916,7 @@ void LoongArchTargetLowering::analyzeInputArgs(
LoongArchABI::ABI ABI =
MF.getSubtarget<LoongArchSubtarget>().getTargetABI();
if (Fn(MF.getDataLayout(), ABI, i, ArgVT, CCValAssign::Full, Ins[i].Flags,
- CCInfo, /*IsFixed=*/true, IsRet, ArgTy)) {
+ CCInfo, IsRet, ArgTy)) {
LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type " << ArgVT
<< '\n');
llvm_unreachable("");
@@ -6934,7 +6934,7 @@ void LoongArchTargetLowering::analyzeOutputArgs(
LoongArchABI::ABI ABI =
MF.getSubtarget<LoongArchSubtarget>().getTargetABI();
if (Fn(MF.getDataLayout(), ABI, i, ArgVT, CCValAssign::Full, Outs[i].Flags,
- CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) {
+ CCInfo, IsRet, OrigTy)) {
LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type " << ArgVT
<< "\n");
llvm_unreachable("");
@@ -7647,8 +7647,7 @@ bool LoongArchTargetLowering::CanLowerReturn(
LoongArchABI::ABI ABI =
MF.getSubtarget<LoongArchSubtarget>().getTargetABI();
if (CC_LoongArch(MF.getDataLayout(), ABI, i, Outs[i].VT, CCValAssign::Full,
- Outs[i].Flags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true,
- nullptr))
+ Outs[i].Flags, CCInfo, /*IsRet=*/true, nullptr))
return false;
}
return true;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 6b49a98f..f79ba74 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -330,7 +330,7 @@ private:
unsigned ValNo, MVT ValVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State,
- bool IsFixed, bool IsRet, Type *OrigTy);
+ bool IsRet, Type *OrigTy);
void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet,
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
index fda9d97..ca5d27d 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
@@ -254,7 +254,8 @@ bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN);
F.setVarFixups({Fixup});
F.setLinkerRelaxable();
- F.getParent()->setLinkerRelaxable();
+ if (!F.getParent()->isLinkerRelaxable())
+ F.getParent()->setFirstLinkerRelaxable(F.getLayoutOrder());
return true;
}
diff --git a/llvm/lib/Target/Mips/MipsCCState.cpp b/llvm/lib/Target/Mips/MipsCCState.cpp
index 9e8cd2e..13237c5 100644
--- a/llvm/lib/Target/Mips/MipsCCState.cpp
+++ b/llvm/lib/Target/Mips/MipsCCState.cpp
@@ -128,12 +128,10 @@ void MipsCCState::PreAnalyzeReturnValue(EVT ArgVT) {
OriginalRetWasFloatVector.push_back(originalEVTTypeIsVectorFloat(ArgVT));
}
-void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, bool IsFixed,
- const char *Func) {
+void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, const char *Func) {
OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, Func));
OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy());
OriginalArgWasFloatVector.push_back(ArgTy->isVectorTy());
- CallOperandIsFixed.push_back(IsFixed);
}
/// Identify lowered values that originated from f128, float and sret to vXfXX
@@ -148,7 +146,6 @@ void MipsCCState::PreAnalyzeCallOperands(
OriginalArgWasF128.push_back(originalTypeIsF128(FuncArg.Ty, Func));
OriginalArgWasFloat.push_back(FuncArg.Ty->isFloatingPointTy());
OriginalArgWasFloatVector.push_back(FuncArg.Ty->isVectorTy());
- CallOperandIsFixed.push_back(Outs[i].IsFixed);
}
}
diff --git a/llvm/lib/Target/Mips/MipsCCState.h b/llvm/lib/Target/Mips/MipsCCState.h
index 4229da5..30b68e8 100644
--- a/llvm/lib/Target/Mips/MipsCCState.h
+++ b/llvm/lib/Target/Mips/MipsCCState.h
@@ -36,7 +36,7 @@ public:
static bool originalEVTTypeIsVectorFloat(EVT Ty);
static bool originalTypeIsVectorFloat(const Type *Ty);
- void PreAnalyzeCallOperand(const Type *ArgTy, bool IsFixed, const char *Func);
+ void PreAnalyzeCallOperand(const Type *ArgTy, const char *Func);
void PreAnalyzeFormalArgument(const Type *ArgTy, ISD::ArgFlagsTy Flags);
void PreAnalyzeReturnValue(EVT ArgVT);
@@ -86,10 +86,6 @@ private:
/// vector.
SmallVector<bool, 4> OriginalRetWasFloatVector;
- /// Records whether the value was a fixed argument.
- /// See ISD::OutputArg::IsFixed,
- SmallVector<bool, 4> CallOperandIsFixed;
-
// Used to handle MIPS16-specific calling convention tweaks.
// FIXME: This should probably be a fully fledged calling convention.
SpecialCallingConvType SpecialCallingConv;
@@ -106,7 +102,6 @@ public:
OriginalArgWasF128.clear();
OriginalArgWasFloat.clear();
OriginalArgWasFloatVector.clear();
- CallOperandIsFixed.clear();
PreAnalyzeCallOperands(Outs, FuncArgs, Func);
}
@@ -213,7 +208,6 @@ public:
bool WasOriginalRetVectorFloat(unsigned ValNo) const {
return OriginalRetWasFloatVector[ValNo];
}
- bool IsCallOperandFixed(unsigned ValNo) { return CallOperandIsFixed[ValNo]; }
SpecialCallingConvType getSpecialCallingConv() { return SpecialCallingConv; }
};
}
diff --git a/llvm/lib/Target/Mips/MipsCallLowering.cpp b/llvm/lib/Target/Mips/MipsCallLowering.cpp
index 555773a..fa49108 100644
--- a/llvm/lib/Target/Mips/MipsCallLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsCallLowering.cpp
@@ -47,7 +47,7 @@ struct MipsOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner {
if (IsReturn)
State.PreAnalyzeReturnValue(EVT::getEVT(Info.Ty));
else
- State.PreAnalyzeCallOperand(Info.Ty, Info.IsFixed, Func);
+ State.PreAnalyzeCallOperand(Info.Ty, Func);
return CallLowering::OutgoingValueAssigner::assignArg(
ValNo, OrigVT, ValVT, LocVT, LocInfo, Info, Flags, State);
diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td
index 39e184a..0e5c16c 100644
--- a/llvm/lib/Target/Mips/MipsCallingConv.td
+++ b/llvm/lib/Target/Mips/MipsCallingConv.td
@@ -29,12 +29,6 @@ class CCIfOrigArgWasFloat<CCAction A>
class CCIfOrigArgWasF128<CCAction A>
: CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgF128(ValNo)", A>;
-/// Match if this specific argument is a vararg.
-/// This is slightly different fro CCIfIsVarArg which matches if any argument is
-/// a vararg.
-class CCIfArgIsVarArg<CCAction A>
- : CCIf<"!static_cast<MipsCCState *>(&State)->IsCallOperandFixed(ValNo)", A>;
-
/// Match if the return was a floating point vector.
class CCIfOrigArgWasNotVectorFloat<CCAction A>
: CCIf<"!static_cast<MipsCCState *>(&State)"
@@ -344,7 +338,7 @@ def CC_Mips_VarArg : CallingConv<[
]>;
def CC_Mips : CallingConv<[
- CCIfVarArg<CCIfArgIsVarArg<CCDelegateTo<CC_Mips_VarArg>>>,
+ CCIfVarArg<CCIfArgVarArg<CCDelegateTo<CC_Mips_VarArg>>>,
CCDelegateTo<CC_Mips_FixedArg>
]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 15f45a1..d4f0cc9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -900,6 +900,17 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
if (STI.allowFP16Math() || STI.hasBF16Math())
setTargetDAGCombine(ISD::SETCC);
+ // Vector reduction operations. These may be turned into shuffle or tree
+ // reductions depending on what instructions are available for each type.
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
+ MVT EltVT = VT.getVectorElementType();
+ if (EltVT == MVT::f32 || EltVT == MVT::f64) {
+ setOperationAction({ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
+ ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
+ VT, Custom);
+ }
+ }
+
// Promote fp16 arithmetic if fp16 hardware isn't available or the
// user passed --nvptx-no-fp16-math. The flag is useful because,
// although sm_53+ GPUs have some sort of FP16 support in
@@ -1143,6 +1154,10 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::BFI)
MAKE_CASE(NVPTXISD::PRMT)
MAKE_CASE(NVPTXISD::FCOPYSIGN)
+ MAKE_CASE(NVPTXISD::FMAXNUM3)
+ MAKE_CASE(NVPTXISD::FMINNUM3)
+ MAKE_CASE(NVPTXISD::FMAXIMUM3)
+ MAKE_CASE(NVPTXISD::FMINIMUM3)
MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
MAKE_CASE(NVPTXISD::STACKRESTORE)
MAKE_CASE(NVPTXISD::STACKSAVE)
@@ -1929,6 +1944,124 @@ static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL,
return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
}
+/// Reduces the elements using the scalar operations provided. The operations
+/// must be sorted in descending order of the number of inputs they take. The
+/// flags on the original reduction operation are propagated to each scalar
+/// operation. Nearby elements are grouped into a tree reduction, unlike the
+/// shuffle reduction used in ExpandReductions and SelectionDAG.
+static SDValue buildTreeReduction(
+ const SmallVector<SDValue> &Elements, EVT EltTy,
+ ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
+ const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
+ // Build the reduction tree at each level, starting with all the elements.
+ SmallVector<SDValue> Level = Elements;
+
+ unsigned OpIdx = 0;
+ while (Level.size() > 1) {
+ // Try to reduce this level using the current operator.
+ const auto [Op, NumInputs] = Ops[OpIdx];
+
+ // Build the next level by partially reducing all elements.
+ SmallVector<SDValue> ReducedLevel;
+ unsigned I = 0, E = Level.size();
+ for (; I + NumInputs <= E; I += NumInputs) {
+ // Reduce elements in groups of [NumInputs], as much as possible.
+ ReducedLevel.push_back(DAG.getNode(
+ Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags));
+ }
+
+ if (I < E) {
+ // Handle leftover elements.
+
+ if (ReducedLevel.empty()) {
+ // We didn't reduce anything at this level. We need to pick a smaller
+ // operator.
+ ++OpIdx;
+ assert(OpIdx < Ops.size() && "no smaller operators for reduction");
+ continue;
+ }
+
+ // We reduced some things but there's still more left, meaning the
+      // operator's number of inputs doesn't evenly divide this level's size. Move
+ // these elements to the next level.
+ for (; I < E; ++I)
+ ReducedLevel.push_back(Level[I]);
+ }
+
+ // Process the next level.
+ Level = ReducedLevel;
+ }
+
+ return *Level.begin();
+}
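A host-side sketch of the same level-by-level loop, with 3-input and 2-input fmax standing in for the NVPTX nodes and plain floats for SDValues (illustrative only, not the backend code). For eight elements the levels shrink 8 -> 4 -> 2 -> 1: two max3 groups plus two leftovers, then one max3 plus one leftover, then a final 2-input max.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

static float reduceMaxTree(std::vector<float> Level) {
  const unsigned Arity[] = {3, 2}; // like ScalarOps: sorted descending
  unsigned OpIdx = 0;
  while (Level.size() > 1) {
    const unsigned N = Arity[OpIdx];
    std::vector<float> Reduced;
    std::size_t I = 0, E = Level.size();
    for (; I + N <= E; I += N) // reduce groups of N as far as possible
      Reduced.push_back(N == 3
                            ? std::max({Level[I], Level[I + 1], Level[I + 2]})
                            : std::max(Level[I], Level[I + 1]));
    if (I < E) {
      if (Reduced.empty()) { // level smaller than N: fall back to 2-input op
        ++OpIdx;
        continue;
      }
      for (; I < E; ++I) // carry leftover elements into the next level
        Reduced.push_back(Level[I]);
    }
    Level = std::move(Reduced);
  }
  return Level.front();
}

int main() {
  assert(reduceMaxTree({1, 7, 3, 9, 2, 8, 4, 6}) == 9);
  return 0;
}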
+
+/// Get the 2-input scalar opcode for a vector reduction.
+static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) {
+ switch (ReductionOpcode) {
+ case ISD::VECREDUCE_FMAX:
+ return ISD::FMAXNUM;
+ case ISD::VECREDUCE_FMIN:
+ return ISD::FMINNUM;
+ case ISD::VECREDUCE_FMAXIMUM:
+ return ISD::FMAXIMUM;
+ case ISD::VECREDUCE_FMINIMUM:
+ return ISD::FMINIMUM;
+ default:
+ llvm_unreachable("unhandled reduction opcode");
+ }
+}
+
+/// Get the 3-input scalar opcode for a vector reduction, if one exists.
+static std::optional<NVPTXISD::NodeType>
+getScalar3OpcodeForReduction(unsigned ReductionOpcode) {
+ switch (ReductionOpcode) {
+ case ISD::VECREDUCE_FMAX:
+ return NVPTXISD::FMAXNUM3;
+ case ISD::VECREDUCE_FMIN:
+ return NVPTXISD::FMINNUM3;
+ case ISD::VECREDUCE_FMAXIMUM:
+ return NVPTXISD::FMAXIMUM3;
+ case ISD::VECREDUCE_FMINIMUM:
+ return NVPTXISD::FMINIMUM3;
+ default:
+ return std::nullopt;
+ }
+}
+
+/// Lower reductions to either a sequence of operations or a tree if
+/// reassociation is allowed. This method will use larger operations like
+/// max3/min3 when the target supports them.
+SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ const SDNodeFlags Flags = Op->getFlags();
+ SDValue Vector = Op.getOperand(0);
+
+ const unsigned Opcode = Op->getOpcode();
+ const EVT EltTy = Vector.getValueType().getVectorElementType();
+
+ // Whether we can use 3-input min/max when expanding the reduction.
+ const bool CanUseMinMax3 =
+ EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
+ STI.getPTXVersion() >= 88 &&
+ (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN ||
+ Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM);
+
+ // A list of SDNode opcodes with equivalent semantics, sorted descending by
+ // number of inputs they take.
+ SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
+
+ if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode);
+ CanUseMinMax3 && Opcode3Elem)
+ ScalarOps.push_back({*Opcode3Elem, 3});
+ ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2});
+
+ SmallVector<SDValue> Elements;
+ DAG.ExtractVectorElements(Vector, Elements);
+
+ return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
+}
+
SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
// Handle bitcasting from v2i8 without hitting the default promotion
// strategy which goes through stack memory.
@@ -2808,6 +2941,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::CONCAT_VECTORS:
return LowerCONCAT_VECTORS(Op, DAG);
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
+ return LowerVECREDUCE(Op, DAG);
case ISD::STORE:
return LowerSTORE(Op, DAG);
case ISD::LOAD:
@@ -3908,6 +4046,18 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}
+ case Intrinsic::nvvm_prefetch_tensormap: {
+ auto &DL = I.getDataLayout();
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = getPointerTy(DL);
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags =
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
+ Info.align.reset();
+ return true;
+ }
+
case Intrinsic::nvvm_ldu_global_i:
case Intrinsic::nvvm_ldu_global_f:
case Intrinsic::nvvm_ldu_global_p: {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index cf72a1e..43e721a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -64,6 +64,11 @@ enum NodeType : unsigned {
UNPACK_VECTOR,
FCOPYSIGN,
+ FMAXNUM3,
+ FMINNUM3,
+ FMAXIMUM3,
+ FMINIMUM3,
+
DYNAMIC_STACKALLOC,
STACKRESTORE,
STACKSAVE,
@@ -286,6 +291,7 @@ private:
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index aac611d..1ab41bf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -347,6 +347,36 @@ multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
}
+// Template for 3-input minimum/maximum instructions
+// (sm_100+/PTX 8.8 and f32 only)
+//
+// Also defines ftz (flush subnormal inputs and results to sign-preserving
+// zero) variants for fp32 functions.
+multiclass FMINIMUMMAXIMUM3<string OpcStr, bit NaN, SDNode OpNode> {
+ defvar nan_str = !if(NaN, ".NaN", "");
+ def f32rrr :
+ BasicFlagsNVPTXInst<(outs B32:$dst),
+ (ins B32:$a, B32:$b, B32:$c),
+ (ins FTZFlag:$ftz),
+ OpcStr # "$ftz" # nan_str # ".f32",
+ [(set f32:$dst, (OpNode f32:$a, f32:$b, f32:$c))]>,
+ Requires<[hasPTX<88>, hasSM<100>]>;
+ def f32rri :
+ BasicFlagsNVPTXInst<(outs B32:$dst),
+ (ins B32:$a, B32:$b, f32imm:$c),
+ (ins FTZFlag:$ftz),
+ OpcStr # "$ftz" # nan_str # ".f32",
+ [(set f32:$dst, (OpNode f32:$a, f32:$b, fpimm:$c))]>,
+ Requires<[hasPTX<88>, hasSM<100>]>;
+ def f32rii :
+ BasicFlagsNVPTXInst<(outs B32:$dst),
+ (ins B32:$a, f32imm:$b, f32imm:$c),
+ (ins FTZFlag:$ftz),
+ OpcStr # "$ftz" # nan_str # ".f32",
+ [(set f32:$dst, (OpNode f32:$a, fpimm:$b, fpimm:$c))]>,
+ Requires<[hasPTX<88>, hasSM<100>]>;
+}
+
// Template for instructions which take three FP args. The
// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
//
@@ -900,6 +930,20 @@ defm MAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>;
defm MIN_NAN : FMINIMUMMAXIMUM<"min", /* NaN */ true, fminimum>;
defm MAX_NAN : FMINIMUMMAXIMUM<"max", /* NaN */ true, fmaximum>;
+def nvptx_fminnum3 : SDNode<"NVPTXISD::FMINNUM3", SDTFPTernaryOp,
+ [SDNPCommutative]>;
+def nvptx_fmaxnum3 : SDNode<"NVPTXISD::FMAXNUM3", SDTFPTernaryOp,
+ [SDNPCommutative]>;
+def nvptx_fminimum3 : SDNode<"NVPTXISD::FMINIMUM3", SDTFPTernaryOp,
+ [SDNPCommutative]>;
+def nvptx_fmaximum3 : SDNode<"NVPTXISD::FMAXIMUM3", SDTFPTernaryOp,
+ [SDNPCommutative]>;
+
+defm FMIN3 : FMINIMUMMAXIMUM3<"min", /* NaN */ false, nvptx_fminnum3>;
+defm FMAX3 : FMINIMUMMAXIMUM3<"max", /* NaN */ false, nvptx_fmaxnum3>;
+defm FMINNAN3 : FMINIMUMMAXIMUM3<"min", /* NaN */ true, nvptx_fminimum3>;
+defm FMAXNAN3 : FMINIMUMMAXIMUM3<"max", /* NaN */ true, nvptx_fmaximum3>;
+
defm FABS : F2<"abs", fabs>;
defm FNEG : F2<"neg", fneg>;
defm FABS_H: F2_Support_Half<"abs", fabs>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index d337192..d4a0ca7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -39,6 +39,12 @@ def AS_match {
code global = [{
return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
}];
+ code const = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_CONST);
+ }];
+ code param = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_PARAM);
+ }];
}
@@ -950,33 +956,47 @@ foreach dim = 3...5 in {
defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4",
[hasTMACTAGroupSupport]>;
-//Prefetch and Prefetchu
-
-let Predicates = [hasPTX<80>, hasSM<90>] in {
- class PREFETCH_INTRS<string InstName> :
- BasicNVPTXInst<(outs), (ins ADDR:$addr),
- InstName,
- [(!cast<Intrinsic>(!strconcat("int_nvvm_",
- !subst(".", "_", InstName))) addr:$addr)]>;
+// Prefetchu and Prefetch
- def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1">;
- def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2">;
- def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1">;
- def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1">;
- def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">;
- def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">;
+defvar frag_pat = (int_nvvm_prefetch_tensormap node:$addr);
- def PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins ADDR:$addr),
- "prefetch.global.L2::evict_normal",
- [(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>;
+multiclass PREFETCH_TENSORMAP_PATFRAG<string suffix, code predicate> {
+ def !tolower(suffix) : PatFrag<!setdagop(frag_pat, ops), frag_pat, predicate>;
+}
- def PREFETCH_GLOBAL_L2_EVICT_LAST : BasicNVPTXInst<(outs), (ins ADDR:$addr),
- "prefetch.global.L2::evict_last",
- [(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>;
+defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"CONST", AS_match.const>;
+defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"GENERIC", AS_match.generic>;
+defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"PARAM", AS_match.param>;
- def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">;
+multiclass PREFETCH_TENSORMAP_INST<string addrspace_name, PatFrag pattern_frag> {
+ def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr),
+ "prefetch" # addrspace_name # ".tensormap",
+ [(pattern_frag addr:$addr)]>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
}
+defm PREFETCH_CONST_TENSORMAP : PREFETCH_TENSORMAP_INST<".const", prefetch_tensormap_const>;
+defm PREFETCH_GENERIC_TENSORMAP : PREFETCH_TENSORMAP_INST<"", prefetch_tensormap_generic>;
+defm PREFETCH_PARAM_TENSORMAP : PREFETCH_TENSORMAP_INST<".param", prefetch_tensormap_param>;
+
+class PREFETCH_INTRS<string InstName, Intrinsic Intr> :
+ BasicNVPTXInst<(outs), (ins ADDR:$addr),
+ InstName,
+ [(Intr addr:$addr)]>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+
+def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1", int_nvvm_prefetchu_L1>;
+def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1", int_nvvm_prefetch_L1>;
+def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2", int_nvvm_prefetch_L2>;
+def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1", int_nvvm_prefetch_global_L1>;
+def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1", int_nvvm_prefetch_local_L1>;
+def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2", int_nvvm_prefetch_global_L2>;
+def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2", int_nvvm_prefetch_local_L2>;
+def PREFETCH_GLOBAL_L2_EVICT_NORMAL : PREFETCH_INTRS<"prefetch.global.L2::evict_normal",
+ int_nvvm_prefetch_global_L2_evict_normal>;
+def PREFETCH_GLOBAL_L2_EVICT_LAST : PREFETCH_INTRS<"prefetch.global.L2::evict_last",
+ int_nvvm_prefetch_global_L2_evict_last>;
+
//Applypriority intrinsics
class APPLYPRIORITY_L2_INTRS<string addrspace> :
BasicNVPTXInst<(outs), (ins ADDR:$addr, B64:$size),
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 3ae2d9d..f4f8961 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -564,7 +564,8 @@ bool NVPTXTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
case Intrinsic::nvvm_isspacep_global:
case Intrinsic::nvvm_isspacep_local:
case Intrinsic::nvvm_isspacep_shared:
- case Intrinsic::nvvm_isspacep_shared_cluster: {
+ case Intrinsic::nvvm_isspacep_shared_cluster:
+ case Intrinsic::nvvm_prefetch_tensormap: {
OpIndexes.push_back(0);
return true;
}
@@ -587,6 +588,11 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
return ConstantInt::get(II->getType(), *R);
return nullptr;
}
+ case Intrinsic::nvvm_prefetch_tensormap: {
+ IRBuilder<> Builder(II);
+ return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap,
+ NewV);
+ }
}
return nullptr;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 9a6e261..b32d931b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -87,6 +87,13 @@ public:
}
unsigned getMinVectorRegisterBitWidth() const override { return 32; }
+ bool shouldExpandReduction(const IntrinsicInst *II) const override {
+    // Turn off the ExpandReductions pass for NVPTX, which doesn't have
+    // advanced swizzling operations. Our backend/SelectionDAG can expand
+    // these reductions with fewer movs.
+ return false;
+ }
+
// We don't want to prevent inlining because of target-cpu and -features
// attributes that were added to newer versions of LLVM/Clang: There are
// no incompatible functions in PTX, ptxas will throw errors in such cases.
diff --git a/llvm/lib/Target/PowerPC/PPCCCState.h b/llvm/lib/Target/PowerPC/PPCCCState.h
index b0e50b2..feab9c5 100644
--- a/llvm/lib/Target/PowerPC/PPCCCState.h
+++ b/llvm/lib/Target/PowerPC/PPCCCState.h
@@ -38,36 +38,6 @@ public:
void clearWasPPCF128() { OriginalArgWasPPCF128.clear(); }
};
-class AIXCCState : public CCState {
-private:
- BitVector IsFixed;
-
-public:
- AIXCCState(CallingConv::ID CC, bool IsVarArg, MachineFunction &MF,
- SmallVectorImpl<CCValAssign> &Locs, LLVMContext &C)
- : CCState(CC, IsVarArg, MF, Locs, C) {}
-
- void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
- CCAssignFn Fn) {
- // All formal arguments are fixed.
- IsFixed.resize(Ins.size(), true);
- CCState::AnalyzeFormalArguments(Ins, Fn);
- }
-
- void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
- CCAssignFn Fn) {
- // Record whether the call operand was a fixed argument.
- IsFixed.resize(Outs.size(), false);
- for (unsigned ValNo = 0, E = Outs.size(); ValNo != E; ++ValNo)
- if (Outs[ValNo].IsFixed)
- IsFixed.set(ValNo);
-
- CCState::AnalyzeCallOperands(Outs, Fn);
- }
-
- bool isFixed(unsigned ValNo) const { return IsFixed.test(ValNo); }
-};
-
} // end namespace llvm
#endif
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 196574e..2698bd6 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -6089,7 +6089,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
bool Result;
- if (Outs[i].IsFixed) {
+ if (!ArgFlags.isVarArg()) {
Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
CCInfo);
} else {
@@ -6905,8 +6905,7 @@ static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) {
static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &S) {
- AIXCCState &State = static_cast<AIXCCState &>(S);
+ CCState &State) {
const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
State.getMachineFunction().getSubtarget());
const bool IsPPC64 = Subtarget.isPPC64();
@@ -7090,7 +7089,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
// They are passed in VRs if any are available (unlike arguments passed
// through ellipses) and shadow GPRs (unlike arguments to non-vaarg
// functions)
- if (State.isFixed(ValNo)) {
+ if (!ArgFlags.isVarArg()) {
if (MCRegister VReg = State.AllocateReg(VR)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));
// Shadow allocate GPRs and stack space even though we pass in a VR.
@@ -7278,7 +7277,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- AIXCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
const EVT PtrVT = getPointerTy(MF.getDataLayout());
// Reserve space for the linkage area on the stack.
@@ -7625,8 +7624,8 @@ SDValue PPCTargetLowering::LowerCall_AIX(
MachineFunction &MF = DAG.getMachineFunction();
SmallVector<CCValAssign, 16> ArgLocs;
- AIXCCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
- *DAG.getContext());
+ CCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
+ *DAG.getContext());
// Reserve space for the linkage save area (LSA) on the stack.
// In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 76dca47..f123040 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1102,13 +1102,20 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
SpillsKnownBit = true;
break;
default:
+ // When spilling a CR bit, the super register may not be explicitly defined
+ // (i.e. it can be defined by a CR-logical that only defines the subreg) so
+ // we state that the CR field is undef. Also, in order to preserve the kill
+ // flag on the CR bit, we add it as an implicit use.
+
// On Power10, we can use SETNBC to spill all CR bits. SETNBC will set all
// bits (specifically, it produces a -1 if the CR bit is set). Ultimately,
// the bit that is of importance to us is bit 32 (bit 0 of a 32-bit
// register), and SETNBC will set this.
if (Subtarget.isISA3_1()) {
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETNBC8 : PPC::SETNBC), Reg)
- .addReg(SrcReg, RegState::Undef);
+ .addReg(SrcReg, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit |
+ getKillRegState(MI.getOperand(0).isKill()));
break;
}
@@ -1122,16 +1129,14 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
SrcReg == PPC::CR4LT || SrcReg == PPC::CR5LT ||
SrcReg == PPC::CR6LT || SrcReg == PPC::CR7LT) {
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETB8 : PPC::SETB), Reg)
- .addReg(getCRFromCRBit(SrcReg), RegState::Undef);
+ .addReg(getCRFromCRBit(SrcReg), RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit |
+ getKillRegState(MI.getOperand(0).isKill()));
break;
}
}
// We need to move the CR field that contains the CR bit we are spilling.
- // The super register may not be explicitly defined (i.e. it can be defined
- // by a CR-logical that only defines the subreg) so we state that the CR
- // field is undef. Also, in order to preserve the kill flag on the CR bit,
- // we add it as an implicit use.
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
.addReg(getCRFromCRBit(SrcReg), RegState::Undef)
.addReg(SrcReg,
diff --git a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp
index 95de9f3..4039fed 100644
--- a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp
@@ -22,3 +22,9 @@ bool PPCSelectionDAGInfo::isTargetStrictFPOpcode(unsigned Opcode) const {
return Opcode >= PPCISD::FIRST_STRICTFP_OPCODE &&
Opcode <= PPCISD::LAST_STRICTFP_OPCODE;
}
+
+std::pair<SDValue, SDValue> PPCSelectionDAGInfo::EmitTargetCodeForMemcmp(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2,
+ SDValue Op3, const CallInst *CI) const {
+ return DAG.getMemcmp(Chain, dl, Op1, Op2, Op3, CI);
+}
diff --git a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h
index 08e2ddb..1537851 100644
--- a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h
@@ -20,6 +20,11 @@ public:
bool isTargetMemoryOpcode(unsigned Opcode) const override;
bool isTargetStrictFPOpcode(unsigned Opcode) const override;
+
+ std::pair<SDValue, SDValue>
+ EmitTargetCodeForMemcmp(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
+ SDValue Op1, SDValue Op2, SDValue Op3,
+ const CallInst *CI) const override;
};
} // namespace llvm
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 67cc01e..e0ac591 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -674,6 +674,9 @@ static constexpr FeatureBitset XAndesGroup = {
static constexpr DecoderListEntry DecoderList32[]{
// Vendor Extensions
+ {DecoderTableXCV32, XCVFeatureGroup, "CORE-V extensions"},
+ {DecoderTableXRivos32, XRivosFeatureGroup, "Rivos"},
+ {DecoderTableXqci32, XqciFeatureGroup, "Qualcomm uC Extensions"},
{DecoderTableXVentana32,
{RISCV::FeatureVendorXVentanaCondOps},
"XVentanaCondOps"},
@@ -690,9 +693,6 @@ static constexpr DecoderListEntry DecoderList32[]{
"MIPS mips.pref"},
{DecoderTableXAndes32, XAndesGroup, "Andes extensions"},
// Standard Extensions
- {DecoderTableXCV32, XCVFeatureGroup, "CORE-V extensions"},
- {DecoderTableXqci32, XqciFeatureGroup, "Qualcomm uC Extensions"},
- {DecoderTableXRivos32, XRivosFeatureGroup, "Rivos"},
{DecoderTable32, {}, "standard 32-bit instructions"},
{DecoderTableRV32Only32, {}, "RV32-only standard 32-bit instructions"},
{DecoderTableZfinx32, {}, "Zfinx (Float in Integer)"},
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
index d2b75a6..34026ed 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
@@ -45,8 +45,8 @@ public:
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
CCState &State) override {
- if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, Info.IsFixed,
- IsRet, Info.Ty))
+ if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, IsRet,
+ Info.Ty))
return true;
StackSize = State.getStackSize();
@@ -196,8 +196,8 @@ public:
if (LocVT.isScalableVector())
MF.getInfo<RISCVMachineFunctionInfo>()->setIsVectorCall();
- if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State,
- /*IsFixed=*/true, IsRet, Info.Ty))
+ if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, IsRet,
+ Info.Ty))
return true;
StackSize = State.getStackSize();
@@ -454,7 +454,7 @@ bool RISCVCallLowering::canLowerReturn(MachineFunction &MF,
for (unsigned I = 0, E = Outs.size(); I < E; ++I) {
MVT VT = MVT::getVT(Outs[I].Ty);
if (CC_RISCV(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], CCInfo,
- /*IsFixed=*/true, /*isRet=*/true, nullptr))
+ /*isRet=*/true, nullptr))
return false;
}
return true;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 95ec42f..8d956ce 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -32,6 +32,11 @@ static cl::opt<bool> ULEB128Reloc(
"riscv-uleb128-reloc", cl::init(true), cl::Hidden,
cl::desc("Emit R_RISCV_SET_ULEB128/E_RISCV_SUB_ULEB128 if appropriate"));
+static cl::opt<bool>
+ AlignRvc("riscv-align-rvc", cl::init(true), cl::Hidden,
+ cl::desc("When generating R_RISCV_ALIGN, insert $alignment-2 "
+ "bytes of NOPs even in norvc code"));
+
RISCVAsmBackend::RISCVAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI,
bool Is64Bit, const MCTargetOptions &Options)
: MCAsmBackend(llvm::endianness::little), STI(STI), OSABI(OSABI),
@@ -306,12 +311,21 @@ void RISCVAsmBackend::relaxInstruction(MCInst &Inst,
// If conditions are met, compute the padding size and create a fixup encoding
// the padding size in the addend.
bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
- // Use default handling unless linker relaxation is enabled and the alignment
- // is larger than the nop size.
- const MCSubtargetInfo *STI = F.getSubtargetInfo();
- if (!STI->hasFeature(RISCV::FeatureRelax))
+ // Alignments before the first linker-relaxable instruction have fixed sizes
+ // and do not require relocations. Alignments after a linker-relaxable
+ // instruction require a relocation, even if the STI specifies norelax.
+ //
+ // firstLinkerRelaxable is the layout order within the subsection, which may
+ // be smaller than the section's order. Therefore, alignments in a
+ // lower-numbered subsection may be unnecessarily treated as linker-relaxable.
+ auto *Sec = F.getParent();
+ if (F.getLayoutOrder() <= Sec->firstLinkerRelaxable())
return false;
- unsigned MinNopLen = STI->hasFeature(RISCV::FeatureStdExtZca) ? 2 : 4;
+
+ // Use default handling unless the alignment is larger than the nop size.
+ const MCSubtargetInfo *STI = F.getSubtargetInfo();
+ unsigned MinNopLen =
+ AlignRvc || STI->hasFeature(RISCV::FeatureStdExtZca) ? 2 : 4;
if (F.getAlignment() <= MinNopLen)
return false;
@@ -321,7 +335,6 @@ bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_RISCV_ALIGN);
F.setVarFixups({Fixup});
F.setLinkerRelaxable();
- F.getParent()->setLinkerRelaxable();
return true;
}
@@ -471,9 +484,12 @@ bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
Count -= 1;
}
+ // TODO: emit a mapping symbol right here
+
if (Count % 4 == 2) {
- // The canonical nop with Zca is c.nop.
- OS.write(STI->hasFeature(RISCV::FeatureStdExtZca) ? "\x01\0" : "\0\0", 2);
+ // The canonical nop with Zca is c.nop. For .balign 4, we generate a 2-byte
+ // c.nop even in a norvc region.
+ OS.write("\x01\0", 2);
Count -= 2;
}
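A sketch of the padding bytes this path now emits, assuming the canonical little-endian encodings: the 4-byte nop (addi x0, x0, 0) is 0x00000013 and the 2-byte c.nop is 0x0001, the "\x01\0" written above. The helper is illustrative, not the MC layer code, and ignores the single-byte remainder handled just before this hunk.

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint8_t> nopPadding(uint64_t Count) {
  std::vector<uint8_t> Bytes;
  if (Count % 4 == 2) { // the 2-byte slot gets a c.nop, even in norvc code
    Bytes.insert(Bytes.end(), {0x01, 0x00});
    Count -= 2;
  }
  for (; Count >= 4; Count -= 4) // the rest is 4-byte canonical nops
    Bytes.insert(Bytes.end(), {0x13, 0x00, 0x00, 0x00});
  return Bytes;
}

int main() {
  assert(nopPadding(6).size() == 6); // one c.nop + one nop
  assert(nopPadding(8).size() == 8); // two nops
  return 0;
}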
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
index cb6117e..70127e3 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
@@ -324,7 +324,7 @@ static MCRegister allocateRVVReg(MVT ValVT, unsigned ValNo, CCState &State,
// Implements the RISC-V calling convention. Returns true upon failure.
bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State, bool IsFixed, bool IsRet, Type *OrigTy) {
+ CCState &State, bool IsRet, Type *OrigTy) {
const MachineFunction &MF = State.getMachineFunction();
const DataLayout &DL = MF.getDataLayout();
const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>();
@@ -379,12 +379,12 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
break;
case RISCVABI::ABI_ILP32F:
case RISCVABI::ABI_LP64F:
- UseGPRForF16_F32 = !IsFixed;
+ UseGPRForF16_F32 = ArgFlags.isVarArg();
break;
case RISCVABI::ABI_ILP32D:
case RISCVABI::ABI_LP64D:
- UseGPRForF16_F32 = !IsFixed;
- UseGPRForF64 = !IsFixed;
+ UseGPRForF16_F32 = ArgFlags.isVarArg();
+ UseGPRForF64 = ArgFlags.isVarArg();
break;
}
@@ -465,7 +465,7 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
// currently if we are using ILP32E calling convention. This behavior may be
// changed when RV32E/ILP32E is ratified.
unsigned TwoXLenInBytes = (2 * XLen) / 8;
- if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes &&
+ if (ArgFlags.isVarArg() && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes &&
DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes &&
ABI != RISCVABI::ABI_ILP32E) {
unsigned RegIdx = State.getFirstUnallocated(ArgGPRs);
@@ -620,8 +620,8 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
// benchmark. But theoretically, it may have benefit for some cases.
bool llvm::CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State,
- bool IsFixed, bool IsRet, Type *OrigTy) {
+ ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsRet,
+ Type *OrigTy) {
const MachineFunction &MF = State.getMachineFunction();
const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>();
const RISCVTargetLowering &TLI = *Subtarget.getTargetLowering();
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.h b/llvm/lib/Target/RISCV/RISCVCallingConv.h
index bf823b7..2030ce1 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.h
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.h
@@ -21,15 +21,15 @@ namespace llvm {
typedef bool RISCVCCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State,
- bool IsFixed, bool IsRet, Type *OrigTy);
+ bool IsRet, Type *OrigTy);
bool CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State, bool IsFixed, bool IsRet, Type *OrigTy);
+ CCState &State, bool IsRet, Type *OrigTy);
bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State, bool IsFixed, bool IsRet, Type *OrigTy);
+ CCState &State, bool IsRet, Type *OrigTy);
bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 03e54b3..e63b937 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1844,6 +1844,17 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 3,
/*IsStore*/ true,
/*IsUnitStrided*/ false, /*UsePtrVal*/ true);
+ case Intrinsic::riscv_sseg2_store_mask:
+ case Intrinsic::riscv_sseg3_store_mask:
+ case Intrinsic::riscv_sseg4_store_mask:
+ case Intrinsic::riscv_sseg5_store_mask:
+ case Intrinsic::riscv_sseg6_store_mask:
+ case Intrinsic::riscv_sseg7_store_mask:
+ case Intrinsic::riscv_sseg8_store_mask:
+    // Operands are (vec, ..., vec, ptr, stride, mask, vl)
+ return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 4,
+ /*IsStore*/ true,
+ /*IsUnitStrided*/ false, /*UsePtrVal*/ true);
case Intrinsic::riscv_vlm:
return SetRVVLoadStoreInfo(/*PtrOp*/ 0,
/*IsStore*/ false,
@@ -11084,69 +11095,118 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return lowerVectorIntrinsicScalars(Op, DAG, Subtarget);
}
-SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
- SelectionDAG &DAG) const {
- unsigned IntNo = Op.getConstantOperandVal(1);
+static SDValue
+lowerFixedVectorSegStoreIntrinsics(unsigned IntNo, SDValue Op,
+ const RISCVSubtarget &Subtarget,
+ SelectionDAG &DAG) {
+ bool IsStrided;
switch (IntNo) {
- default:
- break;
case Intrinsic::riscv_seg2_store_mask:
case Intrinsic::riscv_seg3_store_mask:
case Intrinsic::riscv_seg4_store_mask:
case Intrinsic::riscv_seg5_store_mask:
case Intrinsic::riscv_seg6_store_mask:
case Intrinsic::riscv_seg7_store_mask:
- case Intrinsic::riscv_seg8_store_mask: {
- SDLoc DL(Op);
- static const Intrinsic::ID VssegInts[] = {
- Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
- Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
- Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
- Intrinsic::riscv_vsseg8_mask};
+ case Intrinsic::riscv_seg8_store_mask:
+ IsStrided = false;
+ break;
+ case Intrinsic::riscv_sseg2_store_mask:
+ case Intrinsic::riscv_sseg3_store_mask:
+ case Intrinsic::riscv_sseg4_store_mask:
+ case Intrinsic::riscv_sseg5_store_mask:
+ case Intrinsic::riscv_sseg6_store_mask:
+ case Intrinsic::riscv_sseg7_store_mask:
+ case Intrinsic::riscv_sseg8_store_mask:
+ IsStrided = true;
+ break;
+ default:
+ llvm_unreachable("unexpected intrinsic ID");
+ }
- // Operands: (chain, int_id, vec*, ptr, mask, vl)
- unsigned NF = Op->getNumOperands() - 5;
- assert(NF >= 2 && NF <= 8 && "Unexpected seg number");
- MVT XLenVT = Subtarget.getXLenVT();
- MVT VT = Op->getOperand(2).getSimpleValueType();
- MVT ContainerVT = getContainerForFixedLengthVector(VT);
- unsigned Sz = NF * ContainerVT.getVectorMinNumElements() *
- ContainerVT.getScalarSizeInBits();
- EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF);
+ SDLoc DL(Op);
+ static const Intrinsic::ID VssegInts[] = {
+ Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
+ Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
+ Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
+ Intrinsic::riscv_vsseg8_mask};
+ static const Intrinsic::ID VsssegInts[] = {
+ Intrinsic::riscv_vssseg2_mask, Intrinsic::riscv_vssseg3_mask,
+ Intrinsic::riscv_vssseg4_mask, Intrinsic::riscv_vssseg5_mask,
+ Intrinsic::riscv_vssseg6_mask, Intrinsic::riscv_vssseg7_mask,
+ Intrinsic::riscv_vssseg8_mask};
+
+ // Operands: (chain, int_id, vec*, ptr, mask, vl) or
+ // (chain, int_id, vec*, ptr, stride, mask, vl)
+ unsigned NF = Op->getNumOperands() - (IsStrided ? 6 : 5);
+ assert(NF >= 2 && NF <= 8 && "Unexpected seg number");
+ MVT XLenVT = Subtarget.getXLenVT();
+ MVT VT = Op->getOperand(2).getSimpleValueType();
+ MVT ContainerVT = ::getContainerForFixedLengthVector(DAG, VT, Subtarget);
+ unsigned Sz = NF * ContainerVT.getVectorMinNumElements() *
+ ContainerVT.getScalarSizeInBits();
+ EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF);
- SDValue VL = Op.getOperand(Op.getNumOperands() - 1);
- SDValue Mask = Op.getOperand(Op.getNumOperands() - 2);
- MVT MaskVT = Mask.getSimpleValueType();
- MVT MaskContainerVT =
- ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget);
- Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget);
+ SDValue VL = Op.getOperand(Op.getNumOperands() - 1);
+ SDValue Mask = Op.getOperand(Op.getNumOperands() - 2);
+ MVT MaskVT = Mask.getSimpleValueType();
+ MVT MaskContainerVT =
+ ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget);
+ Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget);
- SDValue IntID = DAG.getTargetConstant(VssegInts[NF - 2], DL, XLenVT);
- SDValue Ptr = Op->getOperand(NF + 2);
+ SDValue IntID = DAG.getTargetConstant(
+ IsStrided ? VsssegInts[NF - 2] : VssegInts[NF - 2], DL, XLenVT);
+ SDValue Ptr = Op->getOperand(NF + 2);
- auto *FixedIntrinsic = cast<MemIntrinsicSDNode>(Op);
+ auto *FixedIntrinsic = cast<MemIntrinsicSDNode>(Op);
- SDValue StoredVal = DAG.getUNDEF(VecTupTy);
- for (unsigned i = 0; i < NF; i++)
- StoredVal = DAG.getNode(
- RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal,
- convertToScalableVector(
- ContainerVT, FixedIntrinsic->getOperand(2 + i), DAG, Subtarget),
- DAG.getTargetConstant(i, DL, MVT::i32));
+ SDValue StoredVal = DAG.getUNDEF(VecTupTy);
+ for (unsigned i = 0; i < NF; i++)
+ StoredVal = DAG.getNode(
+ RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal,
+ convertToScalableVector(ContainerVT, FixedIntrinsic->getOperand(2 + i),
+ DAG, Subtarget),
+ DAG.getTargetConstant(i, DL, MVT::i32));
+
+ SmallVector<SDValue, 10> Ops = {
+ FixedIntrinsic->getChain(),
+ IntID,
+ StoredVal,
+ Ptr,
+ Mask,
+ VL,
+ DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)};
+ // Insert the stride operand.
+ if (IsStrided)
+ Ops.insert(std::next(Ops.begin(), 4),
+ Op.getOperand(Op.getNumOperands() - 3));
+
+ return DAG.getMemIntrinsicNode(
+ ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops,
+ FixedIntrinsic->getMemoryVT(), FixedIntrinsic->getMemOperand());
+}
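The operand bookkeeping above can be summarized with a small illustrative helper (not backend code): given a node's operand count and whether the store is strided, it recovers NF and the well-known operand positions under the two layouts.

#include <cassert>

// Layouts handled above:
//   (chain, id, vec*NF, ptr, mask, vl)           unit-stride
//   (chain, id, vec*NF, ptr, stride, mask, vl)   strided
struct SegStoreLayout {
  unsigned NF, PtrIdx, StrideIdx, MaskIdx, VLIdx;
};

static SegStoreLayout getLayout(unsigned NumOperands, bool IsStrided) {
  SegStoreLayout L;
  L.NF = NumOperands - (IsStrided ? 6 : 5);
  L.PtrIdx = L.NF + 2;                             // Op->getOperand(NF + 2)
  L.StrideIdx = IsStrided ? NumOperands - 3 : ~0u; // absent if unit-stride
  L.MaskIdx = NumOperands - 2;
  L.VLIdx = NumOperands - 1;
  return L;
}

int main() {
  // sseg3 store: chain + id + 3 vecs + ptr + stride + mask + vl = 9 operands.
  const SegStoreLayout L = getLayout(9, /*IsStrided=*/true);
  assert(L.NF == 3 && L.PtrIdx == 5 && L.StrideIdx == 6 && L.MaskIdx == 7 &&
         L.VLIdx == 8);
  return 0;
}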
+
+SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntNo = Op.getConstantOperandVal(1);
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::riscv_seg2_store_mask:
+ case Intrinsic::riscv_seg3_store_mask:
+ case Intrinsic::riscv_seg4_store_mask:
+ case Intrinsic::riscv_seg5_store_mask:
+ case Intrinsic::riscv_seg6_store_mask:
+ case Intrinsic::riscv_seg7_store_mask:
+ case Intrinsic::riscv_seg8_store_mask:
+ case Intrinsic::riscv_sseg2_store_mask:
+ case Intrinsic::riscv_sseg3_store_mask:
+ case Intrinsic::riscv_sseg4_store_mask:
+ case Intrinsic::riscv_sseg5_store_mask:
+ case Intrinsic::riscv_sseg6_store_mask:
+ case Intrinsic::riscv_sseg7_store_mask:
+ case Intrinsic::riscv_sseg8_store_mask:
+ return lowerFixedVectorSegStoreIntrinsics(IntNo, Op, Subtarget, DAG);
- SDValue Ops[] = {
- FixedIntrinsic->getChain(),
- IntID,
- StoredVal,
- Ptr,
- Mask,
- VL,
- DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)};
-
- return DAG.getMemIntrinsicNode(
- ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops,
- FixedIntrinsic->getMemoryVT(), FixedIntrinsic->getMemOperand());
- }
case Intrinsic::riscv_sf_vc_xv_se:
return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_XV_SE);
case Intrinsic::riscv_sf_vc_iv_se:
@@ -22282,8 +22342,8 @@ void RISCVTargetLowering::analyzeInputArgs(
else if (In.isOrigArg())
ArgTy = FType->getParamType(In.getOrigArgIndex());
- if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo,
- /*IsFixed=*/true, IsRet, ArgTy)) {
+ if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet,
+ ArgTy)) {
LLVM_DEBUG(dbgs() << "InputArg #" << Idx << " has unhandled type "
<< ArgVT << '\n');
llvm_unreachable(nullptr);
@@ -22300,8 +22360,8 @@ void RISCVTargetLowering::analyzeOutputArgs(
ISD::ArgFlagsTy ArgFlags = Out.Flags;
Type *OrigTy = CLI ? CLI->getArgs()[Out.OrigArgIndex].Ty : nullptr;
- if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, Out.IsFixed,
- IsRet, OrigTy)) {
+ if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet,
+ OrigTy)) {
LLVM_DEBUG(dbgs() << "OutputArg #" << Idx << " has unhandled type "
<< ArgVT << "\n");
llvm_unreachable(nullptr);
@@ -23083,7 +23143,7 @@ bool RISCVTargetLowering::CanLowerReturn(
MVT VT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
if (CC_RISCV(i, VT, VT, CCValAssign::Full, ArgFlags, CCInfo,
- /*IsFixed=*/true, /*IsRet=*/true, nullptr))
+ /*IsRet=*/true, nullptr))
return false;
}
return true;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 27ad10a..413ad8b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -629,9 +629,6 @@ def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 8)),
def : Pat<(and (or (shl GPR:$rs2, (XLenVT 8)),
(zexti8 (XLenVT GPR:$rs1))), 0xFFFF),
(PACKH GPR:$rs1, GPR:$rs2)>;
-def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)),
- (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
- (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
def : Pat<(binop_allhusers<or> (shl GPR:$rs2, (XLenVT 8)),
(zexti8 (XLenVT GPR:$rs1))),
@@ -642,11 +639,15 @@ let Predicates = [HasStdExtZbkb, IsRV32] in {
def : Pat<(i32 (or (zexti16 (i32 GPR:$rs1)), (shl GPR:$rs2, (i32 16)))),
(PACK GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or (shl GPR:$rs2, (XLenVT 24)),
+ (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
+ (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+
// Match a pattern of 2 bytes being inserted into bits [31:16], with
// bits [15:0] coming from a zero extended value. We can use pack with packh for
// bits [31:16]. If bits [15:0] can also be a packh, it can be matched
// separately.
-def : Pat<(or (or (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24)),
+def : Pat<(or (or (shl GPR:$op1rs2, (XLenVT 24)),
(shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
(zexti16 (XLenVT GPR:$rs1))),
(PACK (XLenVT GPR:$rs1),
@@ -657,6 +658,13 @@ let Predicates = [HasStdExtZbkb, IsRV64] in {
def : Pat<(i64 (or (zexti32 (i64 GPR:$rs1)), (shl GPR:$rs2, (i64 32)))),
(PACK GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)),
+ (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
+ (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (XLenVT 24)),
+ (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
+ (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+
def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (i64 16)),
(zexti16 (i64 GPR:$rs1))),
(PACKW GPR:$rs1, GPR:$rs2)>;
@@ -669,7 +677,7 @@ def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32),
// ignored. We can use packw with packh for bits [31:16]. If bits [15:0] can
// also be a packh, it can be matched separately.
def : Pat<(binop_allwusers<or>
- (or (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24)),
+ (or (shl GPR:$op1rs2, (XLenVT 24)),
(shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
(zexti16 (XLenVT GPR:$rs1))),
(PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
@@ -677,11 +685,11 @@ def : Pat<(binop_allwusers<or>
def : Pat<(binop_allwusers<or>
(or (zexti16 (XLenVT GPR:$rs1)),
(shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
- (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24))),
+ (shl GPR:$op1rs2, (XLenVT 24))),
(PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
def : Pat<(binop_allwusers<or>
(or (zexti16 (XLenVT GPR:$rs1)),
- (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 24))),
+ (shl GPR:$op1rs1, (XLenVT 24))),
(shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))),
(PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
} // Predicates = [HasStdExtZbkb, IsRV64]
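As a sanity check on the PACKH-based byte-packing patterns above, here is the scalar identity they encode, written out as plain, runnable C++ with illustrative values:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t rs1 = 0xAB, rs2 = 0xCD;
  // packh concatenates the two low bytes: 0x0000CDAB. Shifting it left by 16
  // is the same as placing the bytes directly into bits [31:16].
  uint32_t packh = ((rs2 & 0xFF) << 8) | (rs1 & 0xFF);
  assert((packh << 16) == (((rs2 & 0xFF) << 24) | ((rs1 & 0xFF) << 16)));
  return 0;
}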
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
index bf23812..24ebbc3 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
@@ -13,78 +13,113 @@
//
//===----------------------------------------------------------------------===//
-class SMX60IsWorstCaseMX<string mx, list<string> MxList> {
- string LLMUL = LargestLMUL<MxList>.r;
- bit c = !eq(mx, LLMUL);
-}
+//===----------------------------------------------------------------------===//
+// Helpers
+
+// Maps an LMUL string to the corresponding value from the Values array.
+// LMUL values map to array indices as follows:
+// MF8 -> Values[0], MF4 -> Values[1], MF2 -> Values[2], M1 -> Values[3],
+// M2 -> Values[4], M4 -> Values[5], M8 -> Values[6]
+// Shorter lists are allowed, e.g., widening instructions don't work on M8
+class GetLMULValue<list<int> Values, string LMUL> {
+ defvar Index = !cond(
+ !eq(LMUL, "MF8"): 0,
+ !eq(LMUL, "MF4"): 1,
+ !eq(LMUL, "MF2"): 2,
+ !eq(LMUL, "M1"): 3,
+ !eq(LMUL, "M2"): 4,
+ !eq(LMUL, "M4"): 5,
+ !eq(LMUL, "M8"): 6,
+ );
-class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> {
- string LLMUL = LargestLMUL<MxList>.r;
- int SSEW = SmallestSEW<mx, isF>.r;
- bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
+ assert !lt(Index, !size(Values)),
+ "Missing LMUL value for '" # LMUL # "'. " #
+ "Expected at least " # !add(Index, 1) # " elements, but got " #
+ !size(Values) # ".";
+
+ int c = Values[Index];
}
-defvar SMX60VLEN = 256;
-defvar SMX60DLEN = !div(SMX60VLEN, 2);
+// Returns BaseValue for LMUL values before startLMUL, Value for startLMUL,
+// then doubles Value for each subsequent LMUL.
+// Example: ConstValueUntilLMULThenDoubleBase<"M1", 2, 4, mx> yields, per mx:
+//   MF8->2, MF4->2, MF2->2, M1->4, M2->8, M4->16, M8->32
+// This is useful for modeling scheduling parameters that scale with LMUL.
+class ConstValueUntilLMULThenDoubleBase<string startLMUL, int BaseValue, int Value, string currentLMUL> {
+  assert !le(BaseValue, Value), "BaseValue must be less than or equal to Value";
+ defvar startPos = GetLMULValue<[0, 1, 2, 3, 4, 5, 6], startLMUL>.c;
+ defvar currentPos = GetLMULValue<[0, 1, 2, 3, 4, 5, 6], currentLMUL>.c;
+
+ // Calculate the difference in positions
+ defvar posDiff = !sub(currentPos, startPos);
-class Get1248Latency<string mx> {
+ // Calculate Value * (2^posDiff)
int c = !cond(
- !eq(mx, "M2") : 2,
- !eq(mx, "M4") : 4,
- !eq(mx, "M8") : 8,
- true: 1
+ !eq(posDiff, 0) : Value,
+ !eq(posDiff, 1) : !mul(Value, 2),
+ !eq(posDiff, 2) : !mul(Value, 4),
+ !eq(posDiff, 3) : !mul(Value, 8),
+ !eq(posDiff, 4) : !mul(Value, 16),
+ !eq(posDiff, 5) : !mul(Value, 32),
+ !eq(posDiff, 6) : !mul(Value, 64),
+ true : BaseValue
);
}
-// Used for: logical opsz, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides
-class Get4816Latency<string mx> {
- int c = !cond(
- !eq(mx, "M4") : 8,
- !eq(mx, "M8") : 16,
- true: 4
- );
+// Same as ConstValueUntilLMULThenDoubleBase, but with BaseValue == Value.
+class ConstValueUntilLMULThenDouble<string startLMUL, int Value, string currentLMUL> {
+ int c = ConstValueUntilLMULThenDoubleBase<startLMUL, Value, Value, currentLMUL>.c;
+}
+
+// Returns MF8->1, MF4->1, MF2->2, M1->4, M2->8, M4->16, M8->32
+class ConstOneUntilMF4ThenDouble<string mx> {
+ int c = ConstValueUntilLMULThenDouble<"MF4", 1, mx>.c;
}
+// Returns MF8->1, MF4->1, MF2->1, M1->2, M2->4, M4->8, M8->16
+class ConstOneUntilMF2ThenDouble<string mx> {
+ int c = ConstValueUntilLMULThenDouble<"MF2", 1, mx>.c;
+}
+
+// Returns MF8->1, MF4->1, MF2->1, M1->1, M2->2, M4->4, M8->8
+class ConstOneUntilM1ThenDouble<string mx> {
+ int c = ConstValueUntilLMULThenDouble<"M1", 1, mx>.c;
+}
+
+//===----------------------------------------------------------------------===//
+// Latency helper classes
+
// Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max
-class Get458Latency<string mx> {
- int c = !cond(
- !eq(mx, "M4") : 5,
- !eq(mx, "M8") : 8,
- true: 4
- );
+class Get4458Latency<string mx> {
+ int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/4, /*M4=*/5, /*M8=*/8], mx>.c;
}
-// Widening scaling pattern (4,4,4,4,5,8,8): plateaus at higher LMULs
-// Used for: widening operations
+// Used for: widening operations (no M8)
class Get4588Latency<string mx> {
- int c = !cond(
- !eq(mx, "M2") : 5,
- !eq(mx, "M4") : 8,
- !eq(mx, "M8") : 8, // M8 not supported for most widening, fallback
- true: 4
- );
+ int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/5, /*M4=*/8], mx>.c;
}
// Used for: mask-producing comparisons, carry ops with mask, FP comparisons
class Get461018Latency<string mx> {
- int c = !cond(
- !eq(mx, "M2") : 6,
- !eq(mx, "M4") : 10,
- !eq(mx, "M8") : 18,
- true: 4
- );
+ int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/6, /*M4=*/10, /*M8=*/18], mx>.c;
}
-// Used for: e64 multiply pattern, complex ops
-class Get781632Latency<string mx> {
- int c = !cond(
- !eq(mx, "M2") : 8,
- !eq(mx, "M4") : 16,
- !eq(mx, "M8") : 32,
- true: 7
- );
+//===----------------------------------------------------------------------===//
+
+class SMX60IsWorstCaseMX<string mx, list<string> MxList> {
+ string LLMUL = LargestLMUL<MxList>.r;
+ bit c = !eq(mx, LLMUL);
}
+class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> {
+ string LLMUL = LargestLMUL<MxList>.r;
+ int SSEW = SmallestSEW<mx, isF>.r;
+ bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
+}
+
+defvar SMX60VLEN = 256;
+defvar SMX60DLEN = !div(SMX60VLEN, 2);
+
def SpacemitX60Model : SchedMachineModel {
let IssueWidth = 2; // dual-issue
let MicroOpBufferSize = 0; // in-order
@@ -383,12 +418,13 @@ foreach LMul = [1, 2, 4, 8] in {
foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
- let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in {
+ let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [4] in {
defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
}
- let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defvar VIALULat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
+ let Latency = VIALULat, ReleaseAtCycles = [4] in {
// Pattern of vadd, vsub, vrsub: 4/4/5/8
// Pattern of vand, vor, vxor: 4/4/8/16
// They are grouped together, so we use the worst case 4/4/8/16.
@@ -425,7 +461,7 @@ foreach mx = SchedMxList in {
// Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5/5/5/8,
// e64 = 7/8/16/32. We use the worst-case until we can split the SEW.
// TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites
- let Latency = Get781632Latency<mx>.c, ReleaseAtCycles = [7] in {
+ let Latency = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c, ReleaseAtCycles = [7] in {
defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
@@ -461,15 +497,8 @@ foreach mx = SchedMxList in {
foreach sew = SchedSEWSet<mx>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
- // Slightly reduced for fractional LMULs
- defvar Multiplier = !cond(
- !eq(mx, "MF8") : 12,
- !eq(mx, "MF4") : 12,
- !eq(mx, "MF2") : 12,
- true: 24
- );
-
- let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in {
+ defvar VIDivLat = ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c;
+ let Latency = VIDivLat, ReleaseAtCycles = [12] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
}
@@ -480,14 +509,8 @@ foreach mx = SchedMxList in {
foreach mx = SchedMxListW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
- // Slightly increased for integer LMULs
- defvar Multiplier = !cond(
- !eq(mx, "M2") : 2,
- !eq(mx, "M4") : 2,
- true: 1
- );
-
- let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in {
+ defvar VNarrowingLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
+ let Latency = VNarrowingLat, ReleaseAtCycles = [4] in {
defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
@@ -501,16 +524,33 @@ foreach mx = SchedMxListW in {
foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
- defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+ let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
+ defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ // Latency of vsmul: e8/e16 = 4/4/5/8, e32 = 5/5/5/8, e64 = 7/8/16/32
+ // We use the worst-case until we can split the SEW.
+ defvar VSMulLat = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c;
+  // Occupancy (ReleaseAtCycles) of vsmul: e8/e16/e32 = 1/2/4/8, e64 = 4/8/16/32
+ // We use the worst-case until we can split the SEW.
+ defvar VSMulOcc = ConstValueUntilLMULThenDoubleBase<"M1", 1, 4, mx>.c;
+ // TODO: change WriteVSMulV/X to be defined with LMULSEWSchedWrites
+ let Latency = VSMulLat, ReleaseAtCycles = [VSMulOcc] in {
+ defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ defvar VSShiftLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
+ defvar VSShiftOcc = ConstOneUntilMF2ThenDouble<mx>.c;
+ let Latency = VSShiftLat, ReleaseAtCycles = [VSShiftOcc] in {
+ defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
}
// 13. Vector Floating-Point Instructions
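For intuition about the GetLMULValue and ConstValueUntilLMULThenDoubleBase helpers introduced above, a rough standalone C++ analogue follows; it is a sketch only, and the function names are inventions for illustration:

#include <array>
#include <cassert>
#include <cstdint>
#include <string_view>

// Index of an LMUL string in the canonical order MF8, MF4, MF2, M1, M2, M4, M8.
static int lmulIndex(std::string_view LMUL) {
  constexpr std::array<std::string_view, 7> Order = {"MF8", "MF4", "MF2", "M1",
                                                     "M2",  "M4",  "M8"};
  for (int I = 0; I < 7; ++I)
    if (Order[I] == LMUL)
      return I;
  assert(false && "unknown LMUL");
  return -1;
}

// BaseValue below StartLMUL, Value at StartLMUL, then doubled per LMUL step.
static uint64_t constUntilLMULThenDoubleBase(std::string_view StartLMUL,
                                             uint64_t BaseValue, uint64_t Value,
                                             std::string_view CurLMUL) {
  int Diff = lmulIndex(CurLMUL) - lmulIndex(StartLMUL);
  return Diff < 0 ? BaseValue : Value << Diff;
}

int main() {
  // Mirrors the comment above: <"M1", 2, 4, mx> yields
  // MF8->2, MF4->2, MF2->2, M1->4, M2->8, M4->16, M8->32.
  assert(constUntilLMULThenDoubleBase("M1", 2, 4, "MF8") == 2);
  assert(constUntilLMULThenDoubleBase("M1", 2, 4, "M1") == 4);
  assert(constUntilLMULThenDoubleBase("M1", 2, 4, "M8") == 32);
  return 0;
}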
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 05d504c..6a1f4b3 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -114,6 +114,9 @@ public:
bool enableScalableVectorization() const override {
return ST->hasVInstructions();
}
+ bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override {
+ return ST->hasVInstructions();
+ }
TailFoldingStyle
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const override {
return ST->hasVInstructions() ? TailFoldingStyle::DataWithEVL
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index 74aec4f..2b34f61 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -359,18 +359,15 @@ static void lowerExpectAssume(IntrinsicInst *II) {
}
}
-static bool toSpvOverloadedIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID,
- ArrayRef<unsigned> OpNos) {
- Function *F = nullptr;
- if (OpNos.empty()) {
- F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID);
- } else {
- SmallVector<Type *, 4> Tys;
- for (unsigned OpNo : OpNos)
- Tys.push_back(II->getOperand(OpNo)->getType());
- F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID, Tys);
- }
- II->setCalledFunction(F);
+static bool toSpvLifetimeIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID) {
+ IRBuilder<> Builder(II);
+ auto *Alloca = cast<AllocaInst>(II->getArgOperand(0));
+ std::optional<TypeSize> Size =
+ Alloca->getAllocationSize(Alloca->getDataLayout());
+ Value *SizeVal = Builder.getInt64(Size ? *Size : -1);
+ Builder.CreateIntrinsic(NewID, Alloca->getType(),
+ {SizeVal, II->getArgOperand(0)});
+ II->eraseFromParent();
return true;
}
@@ -406,8 +403,8 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
break;
case Intrinsic::lifetime_start:
if (!STI.isShader()) {
- Changed |= toSpvOverloadedIntrinsic(
- II, Intrinsic::SPVIntrinsics::spv_lifetime_start, {1});
+ Changed |= toSpvLifetimeIntrinsic(
+ II, Intrinsic::SPVIntrinsics::spv_lifetime_start);
} else {
II->eraseFromParent();
Changed = true;
@@ -415,8 +412,8 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
break;
case Intrinsic::lifetime_end:
if (!STI.isShader()) {
- Changed |= toSpvOverloadedIntrinsic(
- II, Intrinsic::SPVIntrinsics::spv_lifetime_end, {1});
+ Changed |= toSpvLifetimeIntrinsic(
+ II, Intrinsic::SPVIntrinsics::spv_lifetime_end);
} else {
II->eraseFromParent();
Changed = true;
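Because lifetime intrinsics no longer carry a size operand (a recurring theme in this patch), the lowering above derives the byte count from the alloca itself. The recovery step in isolation, as a hedged sketch that assumes a fixed-size (non-scalable) alloca; the helper name is made up:

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Recover the byte size for a lifetime marker whose only IR operand is now
// the alloca; all-ones (-1) preserves the old "unknown size" encoding.
static uint64_t lifetimeSizeFromAlloca(const AllocaInst &AI) {
  std::optional<TypeSize> Size = AI.getAllocationSize(AI.getDataLayout());
  return Size ? Size->getFixedValue() : uint64_t(-1);
}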
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 1aa8efe..c0fc3a6 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1179,7 +1179,7 @@ static void fixupVariableFloatArgs(SmallVectorImpl<CCValAssign> &ArgLocs,
if (!VA.isRegLoc() || (ValTy != MVT::f64 && ValTy != MVT::f128))
continue;
// The fixed arguments to a varargs function still go in FP registers.
- if (Outs[VA.getValNo()].IsFixed)
+ if (!Outs[VA.getValNo()].Flags.isVarArg())
continue;
// This floating point argument should be reassigned.
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
index 25f4aac..fbb98ff 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
@@ -31,10 +31,6 @@ namespace SystemZ {
class SystemZCCState : public CCState {
private:
- /// Records whether the value was a fixed argument.
- /// See ISD::OutputArg::IsFixed.
- SmallVector<bool, 4> ArgIsFixed;
-
/// Records whether the value was widened from a short vector type.
SmallVector<bool, 4> ArgIsShortVector;
@@ -50,10 +46,6 @@ public:
void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
CCAssignFn Fn) {
- // Formal arguments are always fixed.
- ArgIsFixed.clear();
- for (unsigned i = 0; i < Ins.size(); ++i)
- ArgIsFixed.push_back(true);
// Record whether the call operand was a short vector.
ArgIsShortVector.clear();
for (unsigned i = 0; i < Ins.size(); ++i)
@@ -64,10 +56,6 @@ public:
void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
CCAssignFn Fn) {
- // Record whether the call operand was a fixed argument.
- ArgIsFixed.clear();
- for (unsigned i = 0; i < Outs.size(); ++i)
- ArgIsFixed.push_back(Outs[i].IsFixed);
// Record whether the call operand was a short vector.
ArgIsShortVector.clear();
for (unsigned i = 0; i < Outs.size(); ++i)
@@ -77,12 +65,11 @@ public:
}
// This version of AnalyzeCallOperands in the base class is not usable
- // since we must provide a means of accessing ISD::OutputArg::IsFixed.
+  // since we must provide a means of accessing each operand's short-vector flag.
void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs,
SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
CCAssignFn Fn) = delete;
- bool IsFixed(unsigned ValNo) { return ArgIsFixed[ValNo]; }
bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; }
};
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 0ad872b..059f31f 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -16,14 +16,6 @@ class CCIfSubtarget<string F, CCAction A>
"getSubtarget<SystemZSubtarget>().", F),
A>;
-// Match if this specific argument is a fixed (i.e. named) argument.
-class CCIfFixed<CCAction A>
- : CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>;
-
-// Match if this specific argument is not a fixed (i.e. vararg) argument.
-class CCIfNotFixed<CCAction A>
- : CCIf<"!(static_cast<SystemZCCState *>(&State)->IsFixed(ValNo))", A>;
-
// Match if this specific argument was widened from a short vector type.
class CCIfShortVector<CCAction A>
: CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>;
@@ -79,7 +71,7 @@ def CC_SystemZ_GHC : CallingConv<[
// Pass in STG registers: XMM1, ..., XMM6
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfFixed<CCAssignToReg<[V16, V17, V18, V19, V20, V21]>>>>,
+ CCIfArgFixed<CCAssignToReg<[V16, V17, V18, V19, V20, V21]>>>>,
// Fail otherwise
CCCustom<"CC_SystemZ_GHC_Error">
@@ -125,8 +117,8 @@ def CC_SystemZ_ELF : CallingConv<[
// during type legalization.
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfFixed<CCAssignToReg<[V24, V26, V28, V30,
- V25, V27, V29, V31]>>>>,
+ CCIfArgFixed<CCAssignToReg<[V24, V26, V28, V30,
+ V25, V27, V29, V31]>>>>,
// However, sub-128 vectors which need to go on the stack occupy just a
// single 8-byte-aligned 8-byte stack slot. Pass as i64.
@@ -227,17 +219,17 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
// Promote f32 to f64 and bitcast to i64, if it needs to be passed in GPRs.
// Although we assign the f32 vararg to be bitcast, it will first be promoted
// to an f64 within convertValVTToLocVT().
- CCIfType<[f32, f64], CCIfNotFixed<CCBitConvertToType<i64>>>,
+ CCIfType<[f32, f64], CCIfArgVarArg<CCBitConvertToType<i64>>>,
// Pointers are always passed in full 64-bit registers.
CCIfPtr<CCCustom<"CC_XPLINK64_Pointer">>,
// long double, can only be passed in GPR2 and GPR3, if available,
// hence R2Q
- CCIfType<[f128], CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>,
+ CCIfType<[f128], CCIfArgVarArg<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>,
// Non-fixed vector arguments are treated in the same way as long
// doubles.
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>,
+ CCIfArgVarArg<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>,
// A SwiftSelf is passed in callee-saved R10.
CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R10D]>>>,
@@ -260,22 +252,24 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
// during type legalization.
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>,
+ CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>,
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfFixed<CCAssignToRegAndStack<[V24, V25, V26, V27,
- V28, V29, V30, V31], 16, 8>>>>,
+ CCIfArgFixed<CCAssignToRegAndStack<[V24, V25, V26, V27,
+ V28, V29, V30, V31], 16, 8>>>>,
// The first 4 named float and double arguments are passed in registers
// FPR0-FPR6. The rest will be passed in the user area.
- CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
- CCIfType<[f32], CCIfFixed<CCAssignToRegAndStack<[F0S, F2S, F4S, F6S], 4, 8>>>,
- CCIfType<[f64], CCIfFixed<CCAssignToRegAndStack<[F0D, F2D, F4D, F6D], 8, 8>>>,
+ CCIfType<[f32, f64], CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
+ CCIfType<[f32],
+ CCIfArgFixed<CCAssignToRegAndStack<[F0S, F2S, F4S, F6S], 4, 8>>>,
+ CCIfType<[f64],
+ CCIfArgFixed<CCAssignToRegAndStack<[F0D, F2D, F4D, F6D], 8, 8>>>,
// The first 2 long double arguments are passed in register FPR0/FPR2
// and FPR4/FPR6. The rest will be passed in the user area.
- CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
- CCIfType<[f128], CCIfFixed<CCAssignToRegAndStack<[F0Q, F4Q], 16, 8>>>,
+ CCIfType<[f128], CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
+ CCIfType<[f128], CCIfArgFixed<CCAssignToRegAndStack<[F0Q, F4Q], 16, 8>>>,
// Other arguments are passed in 8-byte-aligned 8-byte stack slots.
CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index d76babe..afe838a 100644
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -181,8 +181,7 @@ static SDValue addIPMSequence(const SDLoc &DL, SDValue CCReg,
std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemcmp(
SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src1,
- SDValue Src2, SDValue Size, MachinePointerInfo Op1PtrInfo,
- MachinePointerInfo Op2PtrInfo) const {
+ SDValue Src2, SDValue Size, const CallInst *CI) const {
SDValue CCReg;
// Swap operands to invert CC == 1 vs. CC == 2 cases.
if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
index c928f34..5a1e0cd 100644
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -41,8 +41,7 @@ public:
std::pair<SDValue, SDValue>
EmitTargetCodeForMemcmp(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
SDValue Src1, SDValue Src2, SDValue Size,
- MachinePointerInfo Op1PtrInfo,
- MachinePointerInfo Op2PtrInfo) const override;
+ const CallInst *CI) const override;
std::pair<SDValue, SDValue>
EmitTargetCodeForMemchr(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index f32c9bd..2611c29 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -436,20 +436,6 @@ bool SystemZTTIImpl::isLSRCostLess(
C2.ScaleCost, C2.SetupCost);
}
-bool SystemZTTIImpl::areInlineCompatible(const Function *Caller,
- const Function *Callee) const {
- const TargetMachine &TM = getTLI()->getTargetMachine();
-
- const FeatureBitset &CallerBits =
- TM.getSubtargetImpl(*Caller)->getFeatureBits();
- const FeatureBitset &CalleeBits =
- TM.getSubtargetImpl(*Callee)->getFeatureBits();
-
- // Support only equal feature bitsets. Restriction should be relaxed in the
- // future to allow inlining when callee's bits are subset of the caller's.
- return CallerBits == CalleeBits;
-}
-
unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
bool Vector = (ClassID == 1);
if (!Vector)
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index dc5736e..fc681de 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -65,9 +65,6 @@ public:
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
const TargetTransformInfo::LSRCost &C2) const override;
- bool areInlineCompatible(const Function *Caller,
- const Function *Callee) const override;
-
/// @}
/// \name Vector TTI Implementations
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index b03b350..fc852d0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -136,6 +136,15 @@ static APInt encodeFunctionSignature(SelectionDAG *DAG, SDLoc &DL,
if (VT == MVT::f64) {
return wasm::ValType::F64;
}
+ if (VT == MVT::externref) {
+ return wasm::ValType::EXTERNREF;
+ }
+ if (VT == MVT::funcref) {
+ return wasm::ValType::FUNCREF;
+ }
+ if (VT == MVT::exnref) {
+ return wasm::ValType::EXNREF;
+ }
LLVM_DEBUG(errs() << "Unhandled type for llvm.wasm.ref.test.func: " << VT
<< "\n");
llvm_unreachable("Unhandled type for llvm.wasm.ref.test.func");
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 3f80b2a..f9eba4b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1309,7 +1309,7 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
OutVal = FINode;
}
// Count the number of fixed args *after* legalization.
- NumFixedArgs += Out.IsFixed;
+ NumFixedArgs += !Out.Flags.isVarArg();
}
bool IsVarArg = CLI.IsVarArg;
@@ -1503,7 +1503,7 @@ SDValue WebAssemblyTargetLowering::LowerReturn(
for (const ISD::OutputArg &Out : Outs) {
assert(!Out.Flags.isByVal() && "byval is not valid for return values");
assert(!Out.Flags.isNest() && "nest is not valid for return values");
- assert(Out.IsFixed && "non-fixed return value is not valid");
+ assert(!Out.Flags.isVarArg() && "non-fixed return value is not valid");
if (Out.Flags.isInAlloca())
fail(DL, DAG, "WebAssembly hasn't implemented inalloca results");
if (Out.Flags.isInConsecutiveRegs())
diff --git a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
index c0a6035..d9f4405 100644
--- a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
+++ b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
@@ -75,7 +75,7 @@ public:
static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2,
X86::XMM3, X86::XMM4, X86::XMM5,
X86::XMM6, X86::XMM7};
- if (!Info.IsFixed)
+ if (Flags.isVarArg())
NumXMMRegs = State.getFirstUnallocated(XMMArgRegs);
return Res;
@@ -363,7 +363,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
Info.CallConv, Info.IsVarArg))
return false;
- bool IsFixed = Info.OrigArgs.empty() ? true : Info.OrigArgs.back().IsFixed;
+ bool IsFixed =
+ Info.OrigArgs.empty() ? true : !Info.OrigArgs.back().Flags[0].isVarArg();
if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(Info.CallConv)) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 3320508..b775c43 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -1821,7 +1821,7 @@ static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape,
// only used outside the region.
if (Valid && Lifetimes.size() != 0) {
auto *NewLifetime = Lifetimes[0]->clone();
- NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(1), AI);
+ NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(0), AI);
NewLifetime->insertBefore(DomBB->getTerminator()->getIterator());
// All the lifetime.start markers outside the region are no longer necessary.
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index da60f52..6ed3b62 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -226,13 +226,6 @@ public:
/*IsVarArgs=*/false);
}
- static ConstantInt *sizeOfAlloca(LLVMContext &Ctx, const DataLayout &DL,
- AllocaInst *Alloced) {
- std::optional<TypeSize> AllocaTypeSize = Alloced->getAllocationSize(DL);
- uint64_t AsInt = AllocaTypeSize ? AllocaTypeSize->getFixedValue() : 0;
- return ConstantInt::get(Type::getInt64Ty(Ctx), AsInt);
- }
-
bool expansionApplicableToFunction(Module &M, Function *F) {
if (F->isIntrinsic() || !F->isVarArg() ||
F->hasFnAttribute(Attribute::Naked))
@@ -577,8 +570,7 @@ ExpandVariadics::defineVariadicWrapper(Module &M, IRBuilder<> &Builder,
AllocaInst *VaListInstance =
Builder.CreateAlloca(VaListTy, nullptr, "va_start");
- Builder.CreateLifetimeStart(VaListInstance,
- sizeOfAlloca(Ctx, DL, VaListInstance));
+ Builder.CreateLifetimeStart(VaListInstance);
Builder.CreateIntrinsic(Intrinsic::vastart, {DL.getAllocaPtrType(Ctx)},
{VaListInstance});
@@ -595,8 +587,7 @@ ExpandVariadics::defineVariadicWrapper(Module &M, IRBuilder<> &Builder,
Builder.CreateIntrinsic(Intrinsic::vaend, {DL.getAllocaPtrType(Ctx)},
{VaListInstance});
- Builder.CreateLifetimeEnd(VaListInstance,
- sizeOfAlloca(Ctx, DL, VaListInstance));
+ Builder.CreateLifetimeEnd(VaListInstance);
if (Result->getType()->isVoidTy())
Builder.CreateRetVoid();
@@ -746,7 +737,7 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB,
// Initialize the fields in the struct
Builder.SetInsertPoint(CB);
- Builder.CreateLifetimeStart(Alloced, sizeOfAlloca(Ctx, DL, Alloced));
+ Builder.CreateLifetimeStart(Alloced);
Frame.initializeStructAlloca(DL, Builder, Alloced);
const unsigned NumArgs = FuncType->getNumParams();
@@ -762,7 +753,7 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB,
Builder.SetCurrentDebugLocation(CB->getStableDebugLoc());
VaList = Builder.CreateAlloca(VaListTy, nullptr, "va_argument");
Builder.SetInsertPoint(CB);
- Builder.CreateLifetimeStart(VaList, sizeOfAlloca(Ctx, DL, VaList));
+ Builder.CreateLifetimeStart(VaList);
}
Builder.SetInsertPoint(CB);
Args.push_back(ABI->initializeVaList(M, Ctx, Builder, VaList, Alloced));
@@ -802,9 +793,9 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB,
}
if (VaList)
- Builder.CreateLifetimeEnd(VaList, sizeOfAlloca(Ctx, DL, VaList));
+ Builder.CreateLifetimeEnd(VaList);
- Builder.CreateLifetimeEnd(Alloced, sizeOfAlloca(Ctx, DL, Alloced));
+ Builder.CreateLifetimeEnd(Alloced);
NewCB->setAttributes(PAL);
NewCB->takeName(CB);
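The call sites above use the slimmed-down IRBuilder lifetime helpers that take only the pointer. A small usage sketch under that post-patch API; the wrapper name is invented for illustration:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Create an alloca already wrapped in a lifetime.start. With the size operand
// gone, the marker's extent is implied by the alloca itself.
static AllocaInst *createScopedSlot(IRBuilder<> &Builder, Type *Ty,
                                    const Twine &Name) {
  AllocaInst *Slot = Builder.CreateAlloca(Ty, /*ArraySize=*/nullptr, Name);
  Builder.CreateLifetimeStart(Slot);
  return Slot; // The caller pairs this with Builder.CreateLifetimeEnd(Slot).
}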
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 47e017e..d7a2ef7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1532,6 +1532,51 @@ static Instruction *foldBitOrderCrossLogicOp(Value *V,
return nullptr;
}
+/// Helper to match idempotent binary intrinsics, namely, intrinsics where
+/// `f(f(x, y), y) == f(x, y)` holds.
+static bool isIdempotentBinaryIntrinsic(Intrinsic::ID IID) {
+ switch (IID) {
+ case Intrinsic::smax:
+ case Intrinsic::smin:
+ case Intrinsic::umax:
+ case Intrinsic::umin:
+ case Intrinsic::maximum:
+ case Intrinsic::minimum:
+ case Intrinsic::maximumnum:
+ case Intrinsic::minimumnum:
+ case Intrinsic::maxnum:
+ case Intrinsic::minnum:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// Attempt to simplify value-accumulating recurrences of the kind:
+///   %umax.acc = phi i8 [ %umax, %backedge ], [ %a, %entry ]
+///   %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)
+/// by hoisting the idempotent binary intrinsic out of the recurrence when its
+/// operands are known to be loop-invariant.
+static Value *foldIdempotentBinaryIntrinsicRecurrence(InstCombinerImpl &IC,
+ IntrinsicInst *II) {
+ PHINode *PN;
+ Value *Init, *OtherOp;
+
+ // A binary intrinsic recurrence with loop-invariant operands is equivalent to
+ // `call @llvm.binary.intrinsic(Init, OtherOp)`.
+ auto IID = II->getIntrinsicID();
+ if (!isIdempotentBinaryIntrinsic(IID) ||
+ !matchSimpleBinaryIntrinsicRecurrence(II, PN, Init, OtherOp) ||
+ !IC.getDominatorTree().dominates(OtherOp, PN))
+ return nullptr;
+
+ auto *InvariantBinaryInst =
+ IC.Builder.CreateBinaryIntrinsic(IID, Init, OtherOp);
+ if (isa<FPMathOperator>(InvariantBinaryInst))
+ cast<Instruction>(InvariantBinaryInst)->copyFastMathFlags(II);
+ return InvariantBinaryInst;
+}
+
static Value *simplifyReductionOperand(Value *Arg, bool CanReorderLanes) {
if (!CanReorderLanes)
return nullptr;
@@ -3912,6 +3957,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
if (Value *Reverse = foldReversedIntrinsicOperands(II))
return replaceInstUsesWith(*II, Reverse);
+ if (Value *Res = foldIdempotentBinaryIntrinsicRecurrence(*this, II))
+ return replaceInstUsesWith(*II, Res);
+
// Some intrinsics (like experimental_gc_statepoint) can be used in invoke
// context, so it is handled in visitCallBase and we should trigger it.
return visitCallBase(*II);
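The fold above is justified by idempotence: reapplying the intrinsic with the same loop-invariant operand cannot change the result, so the whole recurrence collapses to a single application. A self-contained C++ illustration of that identity:

#include <algorithm>
#include <cassert>

int main() {
  int a = 7, b = 42;
  // Loop-carried form: acc = max(acc, b) on every iteration.
  int Acc = a;
  for (int I = 0; I < 100; ++I)
    Acc = std::max(Acc, b);
  // Idempotence: max(max(a, b), b) == max(a, b), so one application suffices.
  assert(Acc == std::max(a, b));
  return 0;
}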
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index fe0f308..b17cf17 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -3042,7 +3042,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
Value *V = LHS;
unsigned MaskElems = Mask.size();
auto *SrcTy = cast<FixedVectorType>(V->getType());
- unsigned VecBitWidth = SrcTy->getPrimitiveSizeInBits().getFixedValue();
+ unsigned VecBitWidth = DL.getTypeSizeInBits(SrcTy);
unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
assert(SrcElemBitWidth && "vector elements must have a bitwidth");
unsigned SrcNumElems = SrcTy->getNumElements();
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 8da65c5..50258af 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1211,23 +1211,19 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
return;
if (!II.isLifetimeStartOrEnd())
return;
- // Found lifetime intrinsic, add ASan instrumentation if necessary.
- auto *Size = cast<ConstantInt>(II.getArgOperand(0));
- // If size argument is undefined, don't do anything.
- if (Size->isMinusOne()) return;
- // Check that size doesn't saturate uint64_t and can
- // be stored in IntptrTy.
- const uint64_t SizeValue = Size->getValue().getLimitedValue();
- if (SizeValue == ~0ULL ||
- !ConstantInt::isValueValidForType(IntptrTy, SizeValue))
- return;
// Find alloca instruction that corresponds to llvm.lifetime argument.
- AllocaInst *AI = dyn_cast<AllocaInst>(II.getArgOperand(1));
+ AllocaInst *AI = dyn_cast<AllocaInst>(II.getArgOperand(0));
// We're interested only in allocas we can handle.
if (!AI || !ASan.isInterestingAlloca(*AI))
return;
+
+ std::optional<TypeSize> Size = AI->getAllocationSize(AI->getDataLayout());
+ // Check that size is known and can be stored in IntptrTy.
+ if (!Size || !ConstantInt::isValueValidForType(IntptrTy, *Size))
+ return;
+
bool DoPoison = (ID == Intrinsic::lifetime_end);
- AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison};
+ AllocaPoisonCall APC = {&II, AI, *Size, DoPoison};
if (AI->isStaticAlloca())
StaticAllocaPoisonCallVec.push_back(APC);
else if (ClInstrumentDynamicAllocas)
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index bcb90d6..fc34d14 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1469,22 +1469,6 @@ void HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
size_t Size = memtag::getAllocaSizeInBytes(*AI);
size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
- auto HandleLifetime = [&](IntrinsicInst *II) {
- // Set the lifetime intrinsic to cover the whole alloca. This reduces the
- // set of assumptions we need to make about the lifetime. Without this we
- // would need to ensure that we can track the lifetime pointer to a
- // constant offset from the alloca, and would still need to change the
- // size to include the extra alignment we use for the untagging to make
- // the size consistent.
- //
- // The check for standard lifetime below makes sure that we have exactly
- // one set of start / end in any execution (i.e. the ends are not
- // reachable from each other), so this will not cause any problems.
- II->setArgOperand(0, ConstantInt::get(Int64Ty, AlignedSize));
- };
- llvm::for_each(Info.LifetimeStart, HandleLifetime);
- llvm::for_each(Info.LifetimeEnd, HandleLifetime);
-
AI->replaceUsesWithIf(Replacement, [AILong](const Use &U) {
auto *User = U.getUser();
return User != AILong && !isa<LifetimeIntrinsic>(User);
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 7d3c940..6e81387 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3301,7 +3301,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void handleLifetimeStart(IntrinsicInst &I) {
if (!PoisonStack)
return;
- AllocaInst *AI = dyn_cast<AllocaInst>(I.getArgOperand(1));
+ AllocaInst *AI = dyn_cast<AllocaInst>(I.getArgOperand(0));
if (AI)
LifetimeStartList.push_back(std::make_pair(&I, AI));
}
diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
index 46b5673..9471ae3 100644
--- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
@@ -789,6 +789,13 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase,
bool NeedsMemMove = false;
IRBuilder<> IRB(BB, IP);
+ auto GetAllocaSize = [&](AllocaInst *AI) {
+ return IRB.CreateMul(
+ IRB.CreateZExtOrTrunc(AI->getArraySize(), IntptrTy),
+ ConstantInt::get(IntptrTy,
+ DL.getTypeAllocSize(AI->getAllocatedType())));
+ };
+
if (auto *A = dyn_cast<Argument>(V)) {
assert(A->hasByValAttr() && "Type reset for non-byval argument?");
@@ -811,8 +818,12 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase,
}
}
} else if (auto *II = dyn_cast<LifetimeIntrinsic>(I)) {
- Size = II->getArgOperand(0);
- Dest = II->getArgOperand(1);
+ auto *AI = dyn_cast<AllocaInst>(II->getArgOperand(0));
+ if (!AI)
+ return false;
+
+ Size = GetAllocaSize(AI);
+ Dest = II->getArgOperand(0);
} else if (auto *AI = dyn_cast<AllocaInst>(I)) {
// We need to clear the types for new stack allocations (or else we might
// read stale type information from a previous function execution).
@@ -820,10 +831,7 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase,
IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(I)));
IRB.SetInstDebugLocation(I);
- Size = IRB.CreateMul(
- IRB.CreateZExtOrTrunc(AI->getArraySize(), IntptrTy),
- ConstantInt::get(IntptrTy,
- DL.getTypeAllocSize(AI->getAllocatedType())));
+ Size = GetAllocaSize(AI);
Dest = I;
} else {
return false;
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 938aab5..a7ba54f 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -582,15 +582,17 @@ struct AllSwitchPaths {
VisitedBlocks VB;
// Get paths from the determinator BBs to SwitchPhiDefBB
std::vector<ThreadingPath> PathsToPhiDef =
- getPathsFromStateDefMap(StateDef, SwitchPhi, VB);
+ getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths);
if (SwitchPhiDefBB == SwitchBlock) {
TPaths = std::move(PathsToPhiDef);
return;
}
+ assert(MaxNumPaths >= PathsToPhiDef.size());
+ auto PathsLimit = MaxNumPaths / PathsToPhiDef.size();
// Find and append paths from SwitchPhiDefBB to SwitchBlock.
PathsType PathsToSwitchBB =
- paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1);
+ paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit);
if (PathsToSwitchBB.empty())
return;
@@ -611,13 +613,16 @@ private:
typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap;
std::vector<ThreadingPath> getPathsFromStateDefMap(StateDefMap &StateDef,
PHINode *Phi,
- VisitedBlocks &VB) {
+ VisitedBlocks &VB,
+ unsigned PathsLimit) {
std::vector<ThreadingPath> Res;
auto *PhiBB = Phi->getParent();
VB.insert(PhiBB);
VisitedBlocks UniqueBlocks;
for (auto *IncomingBB : Phi->blocks()) {
+ if (Res.size() >= PathsLimit)
+ break;
if (!UniqueBlocks.insert(IncomingBB).second)
continue;
if (!SwitchOuterLoop->contains(IncomingBB))
@@ -653,8 +658,9 @@ private:
// Direct predecessor, just add to the path.
if (IncomingPhiDefBB == IncomingBB) {
- std::vector<ThreadingPath> PredPaths =
- getPathsFromStateDefMap(StateDef, IncomingPhi, VB);
+ assert(PathsLimit > Res.size());
+ std::vector<ThreadingPath> PredPaths = getPathsFromStateDefMap(
+ StateDef, IncomingPhi, VB, PathsLimit - Res.size());
for (ThreadingPath &Path : PredPaths) {
Path.push_back(PhiBB);
Res.push_back(std::move(Path));
@@ -667,13 +673,17 @@ private:
continue;
PathsType IntermediatePaths;
- IntermediatePaths =
- paths(IncomingPhiDefBB, IncomingBB, VB, /* PathDepth = */ 1);
+ assert(PathsLimit > Res.size());
+ auto InterPathLimit = PathsLimit - Res.size();
+ IntermediatePaths = paths(IncomingPhiDefBB, IncomingBB, VB,
+ /* PathDepth = */ 1, InterPathLimit);
if (IntermediatePaths.empty())
continue;
+ assert(InterPathLimit >= IntermediatePaths.size());
+ auto PredPathLimit = InterPathLimit / IntermediatePaths.size();
std::vector<ThreadingPath> PredPaths =
- getPathsFromStateDefMap(StateDef, IncomingPhi, VB);
+ getPathsFromStateDefMap(StateDef, IncomingPhi, VB, PredPathLimit);
for (const ThreadingPath &Path : PredPaths) {
for (const PathType &IPath : IntermediatePaths) {
ThreadingPath NewPath(Path);
@@ -688,7 +698,7 @@ private:
}
PathsType paths(BasicBlock *BB, BasicBlock *ToBB, VisitedBlocks &Visited,
- unsigned PathDepth) {
+ unsigned PathDepth, unsigned PathsLimit) {
PathsType Res;
// Stop exploring paths after visiting MaxPathLength blocks
@@ -715,6 +725,8 @@ private:
// is used to prevent a duplicate path from being generated
SmallSet<BasicBlock *, 4> Successors;
for (BasicBlock *Succ : successors(BB)) {
+ if (Res.size() >= PathsLimit)
+ break;
if (!Successors.insert(Succ).second)
continue;
@@ -736,14 +748,12 @@ private:
// coverage and compile time.
if (LI->getLoopFor(Succ) != CurrLoop)
continue;
-
- PathsType SuccPaths = paths(Succ, ToBB, Visited, PathDepth + 1);
+ assert(PathsLimit > Res.size());
+ PathsType SuccPaths =
+ paths(Succ, ToBB, Visited, PathDepth + 1, PathsLimit - Res.size());
for (PathType &Path : SuccPaths) {
Path.push_front(BB);
Res.push_back(Path);
- if (Res.size() >= MaxNumPaths) {
- return Res;
- }
}
}
// This block could now be visited again from a different predecessor. Note
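The PathsLimit plumbing above splits the remaining path budget across sub-searches up front instead of truncating the combined result afterwards. A standalone sketch of the budgeting arithmetic, with illustrative numbers:

#include <cassert>
#include <cstddef>

int main() {
  // With a global cap of 32 paths and 4 paths already found to the
  // switch-phi def block, each may be extended by at most 32 / 4 = 8
  // continuations, keeping the product at or below the cap.
  std::size_t MaxNumPaths = 32, PathsToPhiDef = 4;
  assert(PathsToPhiDef != 0 && MaxNumPaths >= PathsToPhiDef);
  std::size_t PathsLimit = MaxNumPaths / PathsToPhiDef;
  assert(PathsLimit * PathsToPhiDef <= MaxNumPaths);
  return 0;
}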
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 9b87180..f46d54b 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1363,7 +1363,7 @@ struct DSEState {
if (auto *CB = dyn_cast<CallBase>(I)) {
if (CB->getIntrinsicID() == Intrinsic::lifetime_end)
return {
- std::make_pair(MemoryLocation::getForArgument(CB, 1, &TLI), false)};
+ std::make_pair(MemoryLocation::getForArgument(CB, 0, &TLI), false)};
if (Value *FreedOp = getFreedOperand(CB, &TLI))
return {std::make_pair(MemoryLocation::getAfter(FreedOp), true)};
}
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 85ee824..a097d33 100644
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -434,7 +434,7 @@ bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II,
NewV = NewV->stripPointerCasts();
Function *NewDecl = Intrinsic::getOrInsertDeclaration(
M, II->getIntrinsicID(), {NewV->getType()});
- II->setArgOperand(1, NewV);
+ II->setArgOperand(0, NewV);
II->setCalledFunction(NewDecl);
return true;
}
@@ -491,7 +491,7 @@ void InferAddressSpacesImpl::collectRewritableIntrinsicOperands(
}
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end: {
- appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(1),
+ appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
PostorderStack, Visited);
break;
}
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index 0ddc231..e9bf59c 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -58,14 +58,55 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
}
// Compute alignment from known bits.
+ auto InferFromKnownBits = [&](Instruction &I, Value *PtrOp) {
+ KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT);
+ unsigned TrailZ =
+ std::min(Known.countMinTrailingZeros(), +Value::MaxAlignmentExponent);
+ return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+ };
+
+ // Propagate alignment between loads and stores that originate from the
+ // same base pointer.
+ DenseMap<Value *, Align> BestBasePointerAligns;
+ auto InferFromBasePointer = [&](Value *PtrOp, Align LoadStoreAlign) {
+ APInt OffsetFromBase(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0);
+    PtrOp = PtrOp->stripAndAccumulateConstantOffsets(DL, OffsetFromBase,
+                                                     /*AllowNonInbounds=*/true);
+ // Derive the base pointer alignment from the load/store alignment
+ // and the offset from the base pointer.
+ Align BasePointerAlign =
+ commonAlignment(LoadStoreAlign, OffsetFromBase.getLimitedValue());
+
+ auto [It, Inserted] =
+ BestBasePointerAligns.try_emplace(PtrOp, BasePointerAlign);
+ if (!Inserted) {
+ // If the stored base pointer alignment is better than the
+ // base pointer alignment we derived, we may be able to use it
+ // to improve the load/store alignment. If not, store the
+ // improved base pointer alignment for future iterations.
+ if (It->second > BasePointerAlign) {
+ Align BetterLoadStoreAlign =
+ commonAlignment(It->second, OffsetFromBase.getLimitedValue());
+ return BetterLoadStoreAlign;
+ }
+ It->second = BasePointerAlign;
+ }
+ return LoadStoreAlign;
+ };
+
for (BasicBlock &BB : F) {
+ // We need to reset the map for each block because alignment information
+ // can only be propagated from instruction A to B if A dominates B.
+ // This is because control flow (and exception throwing) could be dependent
+ // on the address (and its alignment) at runtime. Some sort of dominator
+ // tree approach could be better, but doing a simple forward pass through a
+ // single basic block is correct too.
+ BestBasePointerAligns.clear();
+
for (Instruction &I : BB) {
Changed |= tryToImproveAlign(
DL, &I, [&](Value *PtrOp, Align OldAlign, Align PrefAlign) {
- KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT);
- unsigned TrailZ = std::min(Known.countMinTrailingZeros(),
- +Value::MaxAlignmentExponent);
- return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+ return std::max(InferFromKnownBits(I, PtrOp),
+ InferFromBasePointer(PtrOp, OldAlign));
});
}
}
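The propagation above rests on a single identity: an access with alignment A at base+Offset proves the base is aligned to commonAlignment(A, Offset), and that base alignment can then be redistributed to other offsets from the same base. A standalone worked instance; the helper reimplements commonAlignment locally for illustration:

#include <algorithm>
#include <cassert>
#include <cstdint>

// min(A, largest power of two dividing Off); an offset of zero constrains
// nothing, so the alignment passes through unchanged.
static uint64_t commonAlign(uint64_t A, uint64_t Off) {
  return Off == 0 ? A : std::min(A, Off & -Off);
}

int main() {
  // A 16-byte-aligned store at base+8 proves the base is 8-byte aligned...
  uint64_t BaseAlign = commonAlign(/*A=*/16, /*Off=*/8);
  assert(BaseAlign == 8);
  // ...so a 1-byte-aligned load at base+4 can be raised to alignment 4.
  assert(commonAlign(BaseAlign, /*Off=*/4) == 4);
  return 0;
}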
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index b3bffeb..c68149b 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -263,6 +263,7 @@ static bool isUniformShape(Value *V) {
case llvm::Instruction::FPExt:
return true;
case llvm::Instruction::AddrSpaceCast:
+ case CastInst::PtrToAddr:
case CastInst::PtrToInt:
case CastInst::IntToPtr:
return false;
@@ -2166,7 +2167,7 @@ public:
// If the loads don't alias the lifetime.end, it won't interfere with
// fusion.
- MemoryLocation EndLoc = MemoryLocation::getForArgument(End, 1, nullptr);
+ MemoryLocation EndLoc = MemoryLocation::getForArgument(End, 0, nullptr);
if (!EndLoc.Ptr)
continue;
if (AA->isNoAlias(Load0Loc, EndLoc) && AA->isNoAlias(Load1Loc, EndLoc))
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 79721dc..f237322 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -915,7 +915,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
// move the bitcast as well, which we don't handle.
if (SkippedLifetimeStart) {
auto *LifetimeArg =
- dyn_cast<Instruction>(SkippedLifetimeStart->getOperand(1));
+ dyn_cast<Instruction>(SkippedLifetimeStart->getOperand(0));
if (LifetimeArg && LifetimeArg->getParent() == C->getParent() &&
C->comesBefore(LifetimeArg))
return false;
@@ -1010,7 +1010,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
// Lifetime of srcAlloca ends at lifetime.end.
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
- II->getArgOperand(1) == srcAlloca)
+ II->getArgOperand(0) == srcAlloca)
break;
}
@@ -1393,7 +1393,7 @@ static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V,
if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst()))
if (II->getIntrinsicID() == Intrinsic::lifetime_start)
if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V)))
- return II->getArgOperand(1) == Alloca;
+ return II->getArgOperand(0) == Alloca;
return false;
}
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 1a52af1..40eeeb2 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -1535,7 +1535,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) {
if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
- auto *LifetimePtr = II->getOperand(1);
+ auto *LifetimePtr = II->getOperand(0);
if (LoadPtr == lookupOperandLeader(LifetimePtr) ||
AA->isMustAlias(LoadPtr, LifetimePtr))
return createConstantExpression(UndefValue::get(LoadType));
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 03d9f32..d6e27aa 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1260,10 +1260,7 @@ private:
return PI.setAborted(&II);
if (II.isLifetimeStartOrEnd()) {
- ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
- uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
- Length->getLimitedValue());
- insertUse(II, Offset, Size, true);
+ insertUse(II, Offset, AllocSize, true);
return;
}
@@ -3614,30 +3611,14 @@ private:
return true;
}
- assert(II.getArgOperand(1) == OldPtr);
- // Lifetime intrinsics are only promotable if they cover the whole alloca.
- // Therefore, we drop lifetime intrinsics which don't cover the whole
- // alloca.
- // (In theory, intrinsics which partially cover an alloca could be
- // promoted, but PromoteMemToReg doesn't handle that case.)
- // FIXME: Check whether the alloca is promotable before dropping the
- // lifetime intrinsics?
- if (NewBeginOffset != NewAllocaBeginOffset ||
- NewEndOffset != NewAllocaEndOffset)
- return true;
-
- ConstantInt *Size =
- ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
- NewEndOffset - NewBeginOffset);
- // Lifetime intrinsics always expect an i8* so directly get such a pointer
- // for the new alloca slice.
+ assert(II.getArgOperand(0) == OldPtr);
Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace());
Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
Value *New;
if (II.getIntrinsicID() == Intrinsic::lifetime_start)
- New = IRB.CreateLifetimeStart(Ptr, Size);
+ New = IRB.CreateLifetimeStart(Ptr);
else
- New = IRB.CreateLifetimeEnd(Ptr, Size);
+ New = IRB.CreateLifetimeEnd(Ptr);
(void)New;
LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 7a9dd37..bbd1ed6 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1099,7 +1099,7 @@ static void eraseLifetimeMarkersOnInputs(const SetVector<BasicBlock *> &Blocks,
// Get the memory operand of the lifetime marker. If the underlying
// object is a sunk alloca, or is otherwise defined in the extraction
// region, the lifetime marker must not be erased.
- Value *Mem = II->getOperand(1)->stripInBoundsOffsets();
+ Value *Mem = II->getOperand(0);
if (SunkAllocas.count(Mem) || definedInRegion(Blocks, Mem))
continue;
@@ -1115,8 +1115,6 @@ static void eraseLifetimeMarkersOnInputs(const SetVector<BasicBlock *> &Blocks,
static void insertLifetimeMarkersSurroundingCall(
Module *M, ArrayRef<Value *> LifetimesStart, ArrayRef<Value *> LifetimesEnd,
CallInst *TheCall) {
- LLVMContext &Ctx = M->getContext();
- auto NegativeOne = ConstantInt::getSigned(Type::getInt64Ty(Ctx), -1);
Instruction *Term = TheCall->getParent()->getTerminator();
// Emit lifetime markers for the pointers given in \p Objects. Insert the
@@ -1130,7 +1128,7 @@ static void insertLifetimeMarkersSurroundingCall(
Function *Func =
Intrinsic::getOrInsertDeclaration(M, MarkerFunc, Mem->getType());
- auto Marker = CallInst::Create(Func, {NegativeOne, Mem});
+ auto Marker = CallInst::Create(Func, Mem);
if (InsertBefore)
Marker->insertBefore(TheCall->getIterator());
else
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index ed3dca2..fa3c467 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -2361,15 +2361,13 @@ remapIndices(Function &Caller, BasicBlock *StartBB,
// Updating the contextual profile after an inlining means, at a high level,
// copying over the data of the callee, **intentionally without any value
// scaling**, and copying over the callees of the inlined callee.
-llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
- PGOContextualProfile &CtxProf,
- bool MergeAttributes,
- AAResults *CalleeAAR,
- bool InsertLifetime,
- Function *ForwardVarArgsTo) {
+llvm::InlineResult llvm::InlineFunction(
+ CallBase &CB, InlineFunctionInfo &IFI, PGOContextualProfile &CtxProf,
+ bool MergeAttributes, AAResults *CalleeAAR, bool InsertLifetime,
+ Function *ForwardVarArgsTo, OptimizationRemarkEmitter *ORE) {
if (!CtxProf.isInSpecializedModule())
return InlineFunction(CB, IFI, MergeAttributes, CalleeAAR, InsertLifetime,
- ForwardVarArgsTo);
+ ForwardVarArgsTo, ORE);
auto &Caller = *CB.getCaller();
auto &Callee = *CB.getCalledFunction();
@@ -2387,7 +2385,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
const auto NumCalleeCallsites = CtxProf.getNumCallsites(Callee);
auto Ret = InlineFunction(CB, IFI, MergeAttributes, CalleeAAR, InsertLifetime,
- ForwardVarArgsTo);
+ ForwardVarArgsTo, ORE);
if (!Ret.isSuccess())
return Ret;
@@ -2457,20 +2455,8 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
return Ret;
}
-/// This function inlines the called function into the basic block of the
-/// caller. This returns false if it is not possible to inline this call.
-/// The program is still in a well defined state if this occurs though.
-///
-/// Note that this only does one level of inlining. For example, if the
-/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
-/// exists in the instruction stream. Similarly this will inline a recursive
-/// function by one level.
-llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
- bool MergeAttributes,
- AAResults *CalleeAAR,
- bool InsertLifetime,
- Function *ForwardVarArgsTo,
- OptimizationRemarkEmitter *ORE) {
+llvm::InlineResult llvm::CanInlineCallSite(const CallBase &CB,
+ InlineFunctionInfo &IFI) {
assert(CB.getParent() && CB.getFunction() && "Instruction not in function!");
// FIXME: we don't inline callbr yet.
@@ -2487,7 +2473,6 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
// The inliner does not know how to inline through calls with operand bundles
// in general ...
- Value *ConvergenceControlToken = nullptr;
if (CB.hasOperandBundles()) {
for (int i = 0, e = CB.getNumOperandBundles(); i != e; ++i) {
auto OBUse = CB.getOperandBundleAt(i);
@@ -2503,7 +2488,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
if (Tag == LLVMContext::OB_kcfi)
continue;
if (Tag == LLVMContext::OB_convergencectrl) {
- ConvergenceControlToken = OBUse.Inputs[0].get();
+ IFI.ConvergenceControlToken = OBUse.Inputs[0].get();
continue;
}
@@ -2521,28 +2506,22 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
// fully implements convergence control tokens, there is no mixing of
// controlled and uncontrolled convergent operations in the whole program.
if (CB.isConvergent()) {
- if (!ConvergenceControlToken &&
+ if (!IFI.ConvergenceControlToken &&
getConvergenceEntry(CalledFunc->getEntryBlock())) {
return InlineResult::failure(
"convergent call needs convergencectrl operand");
}
}
- // If the call to the callee cannot throw, set the 'nounwind' flag on any
- // calls that we inline.
- bool MarkNoUnwind = CB.doesNotThrow();
-
- BasicBlock *OrigBB = CB.getParent();
- Function *Caller = OrigBB->getParent();
+ const BasicBlock *OrigBB = CB.getParent();
+ const Function *Caller = OrigBB->getParent();
// GC poses two hazards to inlining, which only occur when the callee has GC:
// 1. If the caller has no GC, then the callee's GC must be propagated to the
// caller.
// 2. If the caller has a differing GC, it is invalid to inline.
if (CalledFunc->hasGC()) {
- if (!Caller->hasGC())
- Caller->setGC(CalledFunc->getGC());
- else if (CalledFunc->getGC() != Caller->getGC())
+ if (Caller->hasGC() && CalledFunc->getGC() != Caller->getGC())
return InlineResult::failure("incompatible GC");
}
@@ -2560,34 +2539,31 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
? Caller->getPersonalityFn()->stripPointerCasts()
: nullptr;
if (CalledPersonality) {
- if (!CallerPersonality)
- Caller->setPersonalityFn(CalledPersonality);
// If the personality functions match, then we can perform the
// inlining. Otherwise, we can't inline.
// TODO: This isn't 100% true. Some personality functions are proper
// supersets of others and can be used in place of the other.
- else if (CalledPersonality != CallerPersonality)
+ if (CallerPersonality && CalledPersonality != CallerPersonality)
return InlineResult::failure("incompatible personality");
}
// We need to figure out which funclet the callsite was in so that we may
// properly nest the callee.
- Instruction *CallSiteEHPad = nullptr;
if (CallerPersonality) {
EHPersonality Personality = classifyEHPersonality(CallerPersonality);
if (isScopedEHPersonality(Personality)) {
std::optional<OperandBundleUse> ParentFunclet =
CB.getOperandBundle(LLVMContext::OB_funclet);
if (ParentFunclet)
- CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front());
+ IFI.CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front());
// OK, the inlining site is legal. What about the target function?
- if (CallSiteEHPad) {
+ if (IFI.CallSiteEHPad) {
if (Personality == EHPersonality::MSVC_CXX) {
// The MSVC personality cannot tolerate catches getting inlined into
// cleanup funclets.
- if (isa<CleanupPadInst>(CallSiteEHPad)) {
+ if (isa<CleanupPadInst>(IFI.CallSiteEHPad)) {
// Ok, the call site is within a cleanuppad. Let's check the callee
// for catchpads.
for (const BasicBlock &CalledBB : *CalledFunc) {
@@ -2607,13 +2583,34 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
}
}
+ return InlineResult::success();
+}
+
+/// This function inlines the called function into the basic block of the
+/// caller. This returns false if it is not possible to inline this call.
+/// The program is still in a well defined state if this occurs though.
+///
+/// Note that this only does one level of inlining. For example, if the
+/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
+/// exists in the instruction stream. Similarly this will inline a recursive
+/// function by one level.
+void llvm::InlineFunctionImpl(CallBase &CB, InlineFunctionInfo &IFI,
+ bool MergeAttributes, AAResults *CalleeAAR,
+ bool InsertLifetime, Function *ForwardVarArgsTo,
+ OptimizationRemarkEmitter *ORE) {
+ BasicBlock *OrigBB = CB.getParent();
+ Function *Caller = OrigBB->getParent();
+ Function *CalledFunc = CB.getCalledFunction();
+ assert(CalledFunc && !CalledFunc->isDeclaration() &&
+ "CanInlineCallSite should have verified direct call to definition");
+
// Determine if we are dealing with a call in an EHPad which does not unwind
// to caller.
bool EHPadForCallUnwindsLocally = false;
- if (CallSiteEHPad && isa<CallInst>(CB)) {
+ if (IFI.CallSiteEHPad && isa<CallInst>(CB)) {
UnwindDestMemoTy FuncletUnwindMap;
Value *CallSiteUnwindDestToken =
- getUnwindDestToken(CallSiteEHPad, FuncletUnwindMap);
+ getUnwindDestToken(IFI.CallSiteEHPad, FuncletUnwindMap);
EHPadForCallUnwindsLocally =
CallSiteUnwindDestToken &&
@@ -2630,6 +2627,30 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
ClonedCodeInfo InlinedFunctionInfo;
Function::iterator FirstNewBlock;
+ // GC poses two hazards to inlining, which only occur when the callee has GC:
+ // 1. If the caller has no GC, then the callee's GC must be propagated to the
+ // caller.
+ // 2. If the caller has a differing GC, it is invalid to inline.
+ if (CalledFunc->hasGC()) {
+ if (!Caller->hasGC())
+ Caller->setGC(CalledFunc->getGC());
+ else {
+ assert(CalledFunc->getGC() == Caller->getGC() &&
+ "CanInlineCallSite should have verified compatible GCs");
+ }
+ }
+
+ if (CalledFunc->hasPersonalityFn()) {
+ Constant *CalledPersonality =
+ CalledFunc->getPersonalityFn()->stripPointerCasts();
+ if (!Caller->hasPersonalityFn()) {
+ Caller->setPersonalityFn(CalledPersonality);
+ } else
+ assert(Caller->getPersonalityFn()->stripPointerCasts() ==
+ CalledPersonality &&
+ "CanInlineCallSite should have verified compatible personality");
+ }
+
{ // Scope to destroy VMap after cloning.
ValueToValueMapTy VMap;
struct ByValInit {
@@ -2819,10 +2840,10 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
IFI.GetAssumptionCache(*Caller).registerAssumption(II);
}
- if (ConvergenceControlToken) {
+ if (IFI.ConvergenceControlToken) {
IntrinsicInst *IntrinsicCall = getConvergenceEntry(*FirstNewBlock);
if (IntrinsicCall) {
- IntrinsicCall->replaceAllUsesWith(ConvergenceControlToken);
+ IntrinsicCall->replaceAllUsesWith(IFI.ConvergenceControlToken);
IntrinsicCall->eraseFromParent();
}
}
@@ -2869,6 +2890,10 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
}
}
+ // If the call to the callee cannot throw, set the 'nounwind' flag on any
+ // calls that we inline.
+ bool MarkNoUnwind = CB.doesNotThrow();
+
SmallVector<Value*,4> VarArgsToForward;
SmallVector<AttributeSet, 4> VarArgsAttrs;
for (unsigned i = CalledFunc->getFunctionType()->getNumParams();
@@ -2979,31 +3004,11 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
if (hasLifetimeMarkers(AI))
continue;
- // Try to determine the size of the allocation.
- ConstantInt *AllocaSize = nullptr;
- if (ConstantInt *AIArraySize =
- dyn_cast<ConstantInt>(AI->getArraySize())) {
- auto &DL = Caller->getDataLayout();
- Type *AllocaType = AI->getAllocatedType();
- TypeSize AllocaTypeSize = DL.getTypeAllocSize(AllocaType);
- uint64_t AllocaArraySize = AIArraySize->getLimitedValue();
-
- // Don't add markers for zero-sized allocas.
- if (AllocaArraySize == 0)
- continue;
-
- // Check that array size doesn't saturate uint64_t and doesn't
- // overflow when it's multiplied by type size.
- if (!AllocaTypeSize.isScalable() &&
- AllocaArraySize != std::numeric_limits<uint64_t>::max() &&
- std::numeric_limits<uint64_t>::max() / AllocaArraySize >=
- AllocaTypeSize.getFixedValue()) {
- AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()),
- AllocaArraySize * AllocaTypeSize);
- }
- }
+ std::optional<TypeSize> Size = AI->getAllocationSize(AI->getDataLayout());
+ if (Size && Size->isZero())
+ continue;
- builder.CreateLifetimeStart(AI, AllocaSize);
+ builder.CreateLifetimeStart(AI);
for (ReturnInst *RI : Returns) {
// Don't insert llvm.lifetime.end calls between a musttail or deoptimize
// call and a return. The return kills all local allocas.
@@ -3013,7 +3018,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
if (InlinedDeoptimizeCalls &&
RI->getParent()->getTerminatingDeoptimizeCall())
continue;
- IRBuilder<>(RI).CreateLifetimeEnd(AI, AllocaSize);
+ IRBuilder<>(RI).CreateLifetimeEnd(AI);
}
}
}
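
The replacement above leans on AllocaInst::getAllocationSize, which folds the
allocated type's size and a constant array count into a single TypeSize and
returns std::nullopt when the count is not a compile-time constant. A hedged
sketch (the helper and the IR in the comments are illustrative):

    #include "llvm/IR/Instructions.h"
    #include <optional>

    // getAllocationSize results for a few allocas:
    //   %a = alloca [4 x i32]           -> 16 bytes, fixed
    //   %b = alloca i32, i64 %n         -> std::nullopt (dynamic count)
    //   %c = alloca <vscale x 4 x i32>  -> scalable TypeSize
    static bool isKnownZeroSizeAlloca(const llvm::AllocaInst *AI) {
      std::optional<llvm::TypeSize> Size =
          AI->getAllocationSize(AI->getDataLayout());
      return Size && Size->isZero();
    }
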
@@ -3055,12 +3060,12 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
// Update the lexical scopes of the new funclets and callsites.
// Anything that had 'none' as its parent is now nested inside the callsite's
// EHPad.
- if (CallSiteEHPad) {
+ if (IFI.CallSiteEHPad) {
for (Function::iterator BB = FirstNewBlock->getIterator(),
E = Caller->end();
BB != E; ++BB) {
// Add bundle operands to inlined call sites.
- PropagateOperandBundles(BB, CallSiteEHPad);
+ PropagateOperandBundles(BB, IFI.CallSiteEHPad);
// It is problematic if the inlinee has a cleanupret which unwinds to
// caller and we inline it into a call site which doesn't unwind but into
@@ -3076,11 +3081,11 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) {
if (isa<ConstantTokenNone>(CatchSwitch->getParentPad()))
- CatchSwitch->setParentPad(CallSiteEHPad);
+ CatchSwitch->setParentPad(IFI.CallSiteEHPad);
} else {
auto *FPI = cast<FuncletPadInst>(I);
if (isa<ConstantTokenNone>(FPI->getParentPad()))
- FPI->setParentPad(CallSiteEHPad);
+ FPI->setParentPad(IFI.CallSiteEHPad);
}
}
}
@@ -3236,7 +3241,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
AttributeFuncs::mergeAttributesForInlining(*Caller, *CalledFunc);
// We are now done with the inlining.
- return InlineResult::success();
+ return;
}
// Otherwise, we have the normal case, of more than one block to inline or
@@ -3404,6 +3409,19 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
if (MergeAttributes)
AttributeFuncs::mergeAttributesForInlining(*Caller, *CalledFunc);
+}
- return InlineResult::success();
+llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
+ bool MergeAttributes,
+ AAResults *CalleeAAR,
+ bool InsertLifetime,
+ Function *ForwardVarArgsTo,
+ OptimizationRemarkEmitter *ORE) {
+ llvm::InlineResult Result = CanInlineCallSite(CB, IFI);
+ if (Result.isSuccess()) {
+ InlineFunctionImpl(CB, IFI, MergeAttributes, CalleeAAR, InsertLifetime,
+ ForwardVarArgsTo, ORE);
+ }
+
+ return Result;
}
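
The wrapper above fixes the contract of the split API: CanInlineCallSite runs
all of the legality checks, stashing the convergence token and call-site EH
pad it discovers into IFI, and InlineFunctionImpl then consumes that state. A
sketch of the two-step pattern a caller could now use (flag values are
illustrative):

    #include "llvm/IR/InstrTypes.h"
    #include "llvm/Transforms/Utils/Cloning.h"

    // Query legality first, then transform, mirroring the wrapper.
    static bool tryInline(llvm::CallBase &CB, llvm::InlineFunctionInfo &IFI) {
      llvm::InlineResult IR = llvm::CanInlineCallSite(CB, IFI);
      if (!IR.isSuccess())
        return false;
      llvm::InlineFunctionImpl(CB, IFI, /*MergeAttributes=*/false,
                               /*CalleeAAR=*/nullptr, /*InsertLifetime=*/true,
                               /*ForwardVarArgsTo=*/nullptr, /*ORE=*/nullptr);
      return true;
    }
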
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 2619e73..b559212 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -481,7 +481,7 @@ bool llvm::wouldInstructionBeTriviallyDead(const Instruction *I,
return true;
if (II->isLifetimeStartOrEnd()) {
- auto *Arg = II->getArgOperand(1);
+ auto *Arg = II->getArgOperand(0);
if (isa<PoisonValue>(Arg))
return true;
diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
index 472c03f..1f59b17 100644
--- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
@@ -155,7 +155,7 @@ void StackInfoBuilder::visit(OptimizationRemarkEmitter &ORE,
return;
}
if (auto *II = dyn_cast<LifetimeIntrinsic>(&Inst)) {
- AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(1));
+ AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(0));
if (!AI ||
getAllocaInterestingness(*AI) != AllocaInterestingness::kInteresting)
return;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9667b50..1ac84ef 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -548,9 +548,6 @@ public:
protected:
friend class LoopVectorizationPlanner;
- /// Returns (and creates if needed) the trip count of the widened loop.
- Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
-
// Create a check to see if the vector loop should be executed
Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
@@ -2272,56 +2269,6 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
return TTI.enableMaskedInterleavedAccessVectorization();
}
-Value *
-InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
- if (VectorTripCount)
- return VectorTripCount;
-
- Value *TC = getTripCount();
- IRBuilder<> Builder(InsertBlock->getTerminator());
-
- Type *Ty = TC->getType();
- // This is where we can make the step a runtime constant.
- Value *Step = createStepForVF(Builder, Ty, VF, UF);
-
- // If the tail is to be folded by masking, round the number of iterations N
- // up to a multiple of Step instead of rounding down. This is done by first
- // adding Step-1 and then rounding down. Note that it's ok if this addition
- // overflows: the vector induction variable will eventually wrap to zero given
- // that it starts at zero and its Step is a power of two; the loop will then
- // exit, with the last early-exit vector comparison also producing all-true.
- // For scalable vectors the VF is not guaranteed to be a power of 2, but this
- // is accounted for in emitIterationCountCheck that adds an overflow check.
- if (Cost->foldTailByMasking()) {
- assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
- "VF*UF must be a power of 2 when folding tail by masking");
- TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
- "n.rnd.up");
- }
-
- // Now we need to generate the expression for the part of the loop that the
- // vectorized body will execute. This is equal to N - (N % Step) if scalar
- // iterations are not required for correctness, or N - Step, otherwise. Step
- // is equal to the vectorization factor (number of SIMD elements) times the
- // unroll factor (number of SIMD instructions).
- Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
-
- // There are cases where we *must* run at least one iteration in the remainder
- // loop. See the cost model for when this can happen. If the step evenly
- // divides the trip count, we set the remainder to be equal to the step. If
- // the step does not evenly divide the trip count, no adjustment is necessary
- // since there will already be scalar iterations. Note that the minimum
- // iterations check ensures that N >= Step.
- if (Cost->requiresScalarEpilogue(VF.isVector())) {
- auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
- R = Builder.CreateSelect(IsZero, Step, R);
- }
-
- VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
-
- return VectorTripCount;
-}
-
void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
// Note: The block with the minimum trip-count check is already connected
// during earlier VPlan construction.
@@ -7354,6 +7301,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// Canonicalize EVL loops after regions are dissolved.
VPlanTransforms::canonicalizeEVLLoops(BestVPlan);
VPlanTransforms::materializeBackedgeTakenCount(BestVPlan, VectorPH);
+ VPlanTransforms::materializeVectorTripCount(
+ BestVPlan, VectorPH, CM.foldTailByMasking(),
+ CM.requiresScalarEpilogue(BestVF.isVector()));
// Perform the actual loop transformation.
VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
@@ -7410,8 +7360,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
- BestVPlan.prepareToExecute(
- ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
+ BestVPlan.prepareToExecute(State);
replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
// Move check blocks to their final position.
@@ -7530,9 +7479,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
EPI.MainLoopIterationCountCheck =
emitIterationCountCheck(LoopScalarPreHeader, false);
- // Generate the induction variable.
- EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
-
replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
return LoopVectorPreHeader;
}
@@ -8451,11 +8397,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
*Plan, CM.getMinimalBitwidths());
VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan);
// TODO: try to put it close to addActiveLaneMask().
- // Discard the plan if it is not EVL-compatible
- if (CM.foldTailWithEVL() && !HasScalarVF &&
- !VPlanTransforms::runPass(VPlanTransforms::tryAddExplicitVectorLength,
- *Plan, CM.getMaxSafeElements()))
- break;
+ if (CM.foldTailWithEVL() && !HasScalarVF)
+ VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
+ *Plan, CM.getMaxSafeElements());
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
@@ -9412,13 +9356,6 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
cast_if_present<BinaryOperator>(FPBinOp));
DerivedIV->setName(Name);
- // If index is the vector trip count, the concrete value will only be set in
- // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
- // TODO: Remove the special case for the vector trip count once it is computed
- // in VPlan and can be used during VPlan simplification.
- assert((DerivedIV != Index ||
- getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
- "IV didn't need transforming?");
State.set(this, DerivedIV, VPLane(0));
}
@@ -9810,7 +9747,7 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
static void
preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
const SCEV2ValueTy &ExpandedSCEVs,
- const EpilogueLoopVectorizationInfo &EPI) {
+ EpilogueLoopVectorizationInfo &EPI) {
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");
@@ -9828,30 +9765,13 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
// loop.
using namespace llvm::PatternMatch;
PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
- assert(EPResumeVal->getType() == IV->getScalarType() &&
- match(EPResumeVal->getIncomingValueForBlock(
- EPI.MainLoopIterationCountCheck),
- m_SpecificInt(0)) &&
- EPResumeVal ==
- find_singleton<PHINode>(
- L->getLoopPreheader()->phis(),
- [&EPI, IV](PHINode &P, bool) -> PHINode * {
- if (P.getType() == IV->getScalarType() &&
- match(P.getIncomingValueForBlock(
- EPI.MainLoopIterationCountCheck),
- m_SpecificInt(0)) &&
- any_of(P.incoming_values(),
- [&EPI](Value *Inc) {
- return Inc == EPI.VectorTripCount;
- }) &&
- all_of(P.incoming_values(), [&EPI](Value *Inc) {
- return Inc == EPI.VectorTripCount ||
- match(Inc, m_SpecificInt(0));
- }))
- return &P;
- return nullptr;
- }) &&
- "Epilogue resume phis do not match!");
+ for (Value *Inc : EPResumeVal->incoming_values()) {
+ if (match(Inc, m_SpecificInt(0)))
+ continue;
+ assert(!EPI.VectorTripCount &&
+ "Must only have a single non-zero incoming value");
+ EPI.VectorTripCount = Inc;
+ }
VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
assert(all_of(IV->users(),
[](const VPUser *U) {
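
The recovery loop above replaces the deleted structural assert. The property
it relies on, taken from that assert, is that the epilogue resume phi merges a
literal 0 incoming from the main-loop iteration-count check with the main
loop's vector trip count on the remaining edges, so the single non-zero
incoming value is the trip count. A hedged sketch of the same recovery outside
the VPlan context (the helper name is illustrative):

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Instructions.h"
    #include <cassert>

    // Pick the single non-zero incoming value of an epilogue resume phi.
    static llvm::Value *recoverVectorTripCount(llvm::PHINode &PN) {
      llvm::Value *VectorTripCount = nullptr;
      for (llvm::Value *Inc : PN.incoming_values()) {
        if (auto *C = llvm::dyn_cast<llvm::ConstantInt>(Inc); C && C->isZero())
          continue;
        assert(!VectorTripCount && "expected one non-zero incoming value");
        VectorTripCount = Inc;
      }
      return VectorTripCount;
    }
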
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5d0e2f9..ec06a21 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3883,6 +3883,7 @@ private:
enum CombinedOpcode {
NotCombinedOp = -1,
MinMax = Instruction::OtherOpsEnd + 1,
+ FMulAdd,
};
CombinedOpcode CombinedOp = NotCombinedOp;
@@ -4033,6 +4034,9 @@ private:
/// Returns true if any scalar in the list is a copyable element.
bool hasCopyableElements() const { return !CopyableElements.empty(); }
+  /// Returns the InstructionsState of this entry's operations.
+ const InstructionsState &getOperations() const { return S; }
+
/// When ReuseReorderShuffleIndices is empty it just returns position of \p
/// V within vector of Scalars. Otherwise, try to remap on its reuse index.
unsigned findLaneForValue(Value *V) const {
@@ -11987,6 +11991,82 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
}
}
+static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
+ const InstructionsState &S,
+ DominatorTree &DT, const DataLayout &DL,
+ TargetTransformInfo &TTI,
+ const TargetLibraryInfo &TLI) {
+ assert(all_of(VL,
+ [](Value *V) {
+ return V->getType()->getScalarType()->isFloatingPointTy();
+ }) &&
+ "Can only convert to FMA for floating point types");
+ assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
+
+ auto CheckForContractable = [&](ArrayRef<Value *> VL) {
+ FastMathFlags FMF;
+ FMF.set();
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ // TODO: support for copyable elements.
+ Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
+ if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
+ continue;
+ if (auto *FPCI = dyn_cast<FPMathOperator>(I))
+ FMF &= FPCI->getFastMathFlags();
+ }
+ return FMF.allowContract();
+ };
+ if (!CheckForContractable(VL))
+ return InstructionCost::getInvalid();
+  // The feeding fmul must also be contractable.
+ InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
+ SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
+
+ InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
+ if (!OpS.valid())
+ return InstructionCost::getInvalid();
+ if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
+ return InstructionCost::getInvalid();
+ if (!CheckForContractable(Operands.front()))
+ return InstructionCost::getInvalid();
+ // Compare the costs.
+ InstructionCost FMulPlusFAddCost = 0;
+ InstructionCost FMACost = 0;
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ FastMathFlags FMF;
+ FMF.set();
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (auto *FPCI = dyn_cast<FPMathOperator>(I))
+ FMF &= FPCI->getFastMathFlags();
+ FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
+ }
+ unsigned NumOps = 0;
+ for (auto [V, Op] : zip(VL, Operands.front())) {
+ auto *I = dyn_cast<Instruction>(Op);
+ if (!I || !I->hasOneUse()) {
+ if (auto *OpI = dyn_cast<Instruction>(V))
+ FMACost += TTI.getInstructionCost(OpI, CostKind);
+ if (I)
+ FMACost += TTI.getInstructionCost(I, CostKind);
+ continue;
+ }
+ ++NumOps;
+ if (auto *FPCI = dyn_cast<FPMathOperator>(I))
+ FMF &= FPCI->getFastMathFlags();
+ FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
+ }
+ Type *Ty = VL.front()->getType();
+ IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
+ FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
+ return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
+}
+
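
For orientation, the scalar shape canConvertToFMA prices is a contractable
fmul feeding a contractable fadd or fsub, folded into a single llvm.fmuladd
call when the intrinsic's cost wins. A minimal sketch (the IR in the comment
is illustrative):

    // Candidate pattern:
    //   %m = fmul contract float %a, %b
    //   %r = fadd contract float %m, %c   ; -> @llvm.fmuladd(%a, %b, %c)
    // Roughly the C++ source that produces it when contraction is allowed:
    float muladd(float a, float b, float c) { return a * b + c; }
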
void BoUpSLP::transformNodes() {
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
BaseGraphSize = VectorizableTree.size();
@@ -12355,6 +12435,25 @@ void BoUpSLP::transformNodes() {
}
break;
}
+ case Instruction::FSub:
+ case Instruction::FAdd: {
+ // Check if possible to convert (a*b)+c to fma.
+ if (E.State != TreeEntry::Vectorize ||
+ !E.getOperations().isAddSubLikeOp())
+ break;
+ if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
+ .isValid())
+ break;
+      // Mark this node as a combined fmuladd node.
+ E.CombinedOp = TreeEntry::FMulAdd;
+ TreeEntry *FMulEntry = getOperandEntry(&E, 0);
+ if (FMulEntry->UserTreeIndex &&
+ FMulEntry->State == TreeEntry::Vectorize) {
+ // The FMul node is part of the combined fmuladd node.
+ FMulEntry->State = TreeEntry::CombinedVectorize;
+ }
+ break;
+ }
default:
break;
}
@@ -13587,6 +13686,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
return IntrinsicCost;
};
+ auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
+ Instruction *VI) {
+ InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
+ return Cost;
+ };
switch (ShuffleOrOp) {
case Instruction::PHI: {
// Count reused scalars.
@@ -13927,6 +14031,30 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
};
return GetCostDiff(GetScalarCost, GetVectorCost);
}
+ case TreeEntry::FMulAdd: {
+ auto GetScalarCost = [&](unsigned Idx) {
+ if (isa<PoisonValue>(UniqueValues[Idx]))
+ return InstructionCost(TTI::TCC_Free);
+ return GetFMulAddCost(E->getOperations(),
+ cast<Instruction>(UniqueValues[Idx]));
+ };
+ auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
+ FastMathFlags FMF;
+ FMF.set();
+ for (Value *V : E->Scalars) {
+ if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
+ FMF &= FPCI->getFastMathFlags();
+ if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
+ FMF &= FPCIOp->getFastMathFlags();
+ }
+ }
+ IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
+ {VecTy, VecTy, VecTy}, FMF);
+ InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
+ return VecCost + CommonCost;
+ };
+ return GetCostDiff(GetScalarCost, GetVectorCost);
+ }
case Instruction::FNeg:
case Instruction::Add:
case Instruction::FAdd:
@@ -13964,8 +14092,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
- return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
- Op1Info, Op2Info, Operands);
+ InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
+ ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
+ if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
+ I && (ShuffleOrOp == Instruction::FAdd ||
+ ShuffleOrOp == Instruction::FSub)) {
+ InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
+ if (IntrinsicCost.isValid())
+ ScalarCost = IntrinsicCost;
+ }
+ return ScalarCost;
};
auto GetVectorCost = [=](InstructionCost CommonCost) {
if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
@@ -22594,11 +22730,21 @@ public:
/// Try to find a reduction tree.
bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
ScalarEvolution &SE, const DataLayout &DL,
- const TargetLibraryInfo &TLI) {
+ const TargetLibraryInfo &TLI,
+ DominatorTree &DT, TargetTransformInfo &TTI) {
RdxKind = HorizontalReduction::getRdxKind(Root);
if (!isVectorizable(RdxKind, Root))
return false;
+  // Skip reduction roots that can be contracted into an FMA instead.
+ auto CheckForFMA = [&](Instruction *I) {
+ return RdxKind == RecurKind::FAdd &&
+ canConvertToFMA(I, getSameOpcode(I, TLI), DT, DL, TTI, TLI)
+ .isValid();
+ };
+ if (CheckForFMA(Root))
+ return false;
+
// Analyze "regular" integer/FP types for reductions - no target-specific
// types or pointers.
Type *Ty = Root->getType();
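
A hedged note on the new bail-out: canConvertToFMA only returns a valid cost
when the fused form is cheaper, so declining the reduction here defers such
chains to FMA formation instead of tearing the fmul/fadd pairs apart. A scalar
chain of the kind now skipped:

    // Each fadd is fed by a contractable fmul, so fmuladd formation is
    // preferred over an SLP reduction of the fadds (assuming the fast-math
    // flags permit contraction).
    float dot4(const float *a, const float *b) {
      float s = 0.0f;
      for (int i = 0; i < 4; ++i)
        s += a[i] * b[i]; // fadd contract (fmul contract a[i], b[i])
      return s;
    }
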
@@ -22636,7 +22782,7 @@ public:
// Also, do not try to reduce const values, if the operation is not
// foldable.
if (!EdgeInst || Level > RecursionMaxDepth ||
- getRdxKind(EdgeInst) != RdxKind ||
+ getRdxKind(EdgeInst) != RdxKind || CheckForFMA(EdgeInst) ||
IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
!hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
!isVectorizable(RdxKind, EdgeInst) ||
@@ -24205,13 +24351,13 @@ bool SLPVectorizerPass::vectorizeHorReduction(
Stack.emplace(SelectRoot(), 0);
SmallPtrSet<Value *, 8> VisitedInstrs;
bool Res = false;
- auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
+ auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
if (R.isAnalyzedReductionRoot(Inst))
return nullptr;
if (!isReductionCandidate(Inst))
return nullptr;
HorizontalReduction HorRdx;
- if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
+ if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI, *DT, *TTI))
return nullptr;
return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
};
@@ -24277,6 +24423,12 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
return false;
+ // Skip potential FMA candidates.
+ if ((I->getOpcode() == Instruction::FAdd ||
+ I->getOpcode() == Instruction::FSub) &&
+ canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
+ .isValid())
+ return false;
Value *P = I->getParent();
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
index f32d57f..e414c12 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
@@ -81,6 +81,7 @@ LegalityAnalysis::notVectorizableBasedOnOpcodesAndTypes(
case Instruction::Opcode::FPToUI:
case Instruction::Opcode::FPToSI:
case Instruction::Opcode::FPExt:
+ case Instruction::Opcode::PtrToAddr:
case Instruction::Opcode::PtrToInt:
case Instruction::Opcode::IntToPtr:
case Instruction::Opcode::SIToFP:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 8052e31..a820b524 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -951,15 +951,9 @@ VPlan::~VPlan() {
delete BackedgeTakenCount;
}
-void VPlan::prepareToExecute(Value *VectorTripCountV, VPTransformState &State) {
- if (!VectorTripCount.getUnderlyingValue())
- VectorTripCount.setUnderlyingValue(VectorTripCountV);
- else
- assert(VectorTripCount.getUnderlyingValue() == VectorTripCountV &&
- "VectorTripCount set earlier must much VectorTripCountV");
-
+void VPlan::prepareToExecute(VPTransformState &State) {
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
- Type *TCTy = VectorTripCountV->getType();
+ Type *TCTy = VPTypeAnalysis(*this).inferScalarType(getTripCount());
// FIXME: Model VF * UF computation completely in VPlan.
unsigned UF = getUF();
if (VF.getNumUsers()) {
@@ -1054,12 +1048,17 @@ void VPlan::execute(VPTransformState *State) {
InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
// For now only return the cost of the vector loop region, ignoring any other
- // blocks, like the preheader or middle blocks.
+  // blocks, like the preheader or middle blocks, except for checking them for
+ // recipes with invalid costs.
InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx);
- // If any instructions in the middle block are invalid return invalid.
- // TODO: Remove once no VPlans with VF == vscale x 1 and first-order recurrences are created.
- if (!getMiddleBlock()->cost(VF, Ctx).isValid())
+  // If the cost of the loop region is invalid, or any recipe in the skeleton
+  // outside loop regions has an invalid cost, return an invalid cost.
+ if (!Cost.isValid() || any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(getEntry())),
+ [&VF, &Ctx](VPBasicBlock *VPBB) {
+ return !VPBB->cost(VF, Ctx).isValid();
+ }))
return InstructionCost::getInvalid();
return Cost;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c42cdd5..6f098351 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2408,11 +2408,11 @@ public:
// TODO: extend the masked interleaved-group support to reversed access.
assert((!Mask || !IG->isReverse()) &&
"Reversed masked interleave-group not supported.");
- for (unsigned i = 0; i < IG->getFactor(); ++i)
- if (Instruction *I = IG->getMember(i)) {
- if (I->getType()->isVoidTy())
+ for (unsigned I = 0; I < IG->getFactor(); ++I)
+ if (Instruction *Inst = IG->getMember(I)) {
+ if (Inst->getType()->isVoidTy())
continue;
- new VPValue(I, this);
+ new VPValue(Inst, this);
}
for (auto *SV : StoredValues)
@@ -3969,7 +3969,7 @@ public:
}
/// Prepare the plan for execution, setting up the required live-in values.
- void prepareToExecute(Value *VectorTripCount, VPTransformState &State);
+ void prepareToExecute(VPTransformState &State);
/// Generate the IR code for this VPlan.
void execute(VPTransformState *State);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 0cb704c..34b2abf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2185,6 +2185,21 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
"User of VF that we can't transform to EVL.");
Plan.getVF().replaceAllUsesWith(&EVL);
+ assert(all_of(Plan.getVFxUF().users(),
+ [&Plan](VPUser *U) {
+ return match(U, m_c_Binary<Instruction::Add>(
+ m_Specific(Plan.getCanonicalIV()),
+ m_Specific(&Plan.getVFxUF()))) ||
+ isa<VPWidenPointerInductionRecipe>(U);
+ }) &&
+             "The only users of VFxUF should be VPWidenPointerInductionRecipes "
+             "and the increment of the canonical induction.");
+ Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
+  // Only replace uses in VPWidenPointerInductionRecipes; the increment of the
+  // canonical induction must not be updated.
+ return isa<VPWidenPointerInductionRecipe>(U);
+ });
+
// Defer erasing recipes till the end so that we don't invalidate the
// VPTypeAnalysis cache.
SmallVector<VPRecipeBase *> ToErase;
@@ -2320,16 +2335,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
/// ...
///
-bool VPlanTransforms::tryAddExplicitVectorLength(
+void VPlanTransforms::addExplicitVectorLength(
VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
- // The transform updates all users of inductions to work based on EVL, instead
- // of the VF directly. At the moment, widened pointer inductions cannot be
- // updated, so bail out if the plan contains any.
- bool ContainsWidenPointerInductions =
- any_of(Header->phis(), IsaPred<VPWidenPointerInductionRecipe>);
- if (ContainsWidenPointerInductions)
- return false;
auto *CanonicalIVPHI = Plan.getCanonicalIV();
auto *CanIVTy = CanonicalIVPHI->getScalarType();
@@ -2384,7 +2392,6 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
// TODO: support unroll factor > 1.
Plan.setUF(1);
- return true;
}
void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
@@ -2808,13 +2815,12 @@ static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
R->replaceAllUsesWith(PtrAdd);
// Create the backedge value for the scalar pointer phi.
- Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
+ VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
+ Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
DL);
VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
- VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
- Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
VPValue *InductionGEP =
Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
ScalarPtrPhi->addOperand(InductionGEP);
@@ -3272,6 +3278,67 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
BTC->replaceAllUsesWith(TCMO);
}
+void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
+ VPBasicBlock *VectorPHVPBB,
+ bool TailByMasking,
+ bool RequiresScalarEpilogue) {
+ VPValue &VectorTC = Plan.getVectorTripCount();
+ assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
+ // There's nothing to do if there are no users of the vector trip count or its
+ // IR value has already been set.
+ if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
+ return;
+
+ VPValue *TC = Plan.getTripCount();
+ Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
+ VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
+ VPValue *Step = &Plan.getVFxUF();
+
+ // If the tail is to be folded by masking, round the number of iterations N
+ // up to a multiple of Step instead of rounding down. This is done by first
+ // adding Step-1 and then rounding down. Note that it's ok if this addition
+ // overflows: the vector induction variable will eventually wrap to zero given
+ // that it starts at zero and its Step is a power of two; the loop will then
+ // exit, with the last early-exit vector comparison also producing all-true.
+ // For scalable vectors the VF is not guaranteed to be a power of 2, but this
+ // is accounted for in emitIterationCountCheck that adds an overflow check.
+ if (TailByMasking) {
+ TC = Builder.createNaryOp(
+ Instruction::Add,
+ {TC, Builder.createNaryOp(
+ Instruction::Sub,
+ {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
+ DebugLoc::getCompilerGenerated(), "n.rnd.up");
+ }
+
+ // Now we need to generate the expression for the part of the loop that the
+ // vectorized body will execute. This is equal to N - (N % Step) if scalar
+ // iterations are not required for correctness, or N - Step, otherwise. Step
+ // is equal to the vectorization factor (number of SIMD elements) times the
+ // unroll factor (number of SIMD instructions).
+ VPValue *R =
+ Builder.createNaryOp(Instruction::URem, {TC, Step},
+ DebugLoc::getCompilerGenerated(), "n.mod.vf");
+
+ // There are cases where we *must* run at least one iteration in the remainder
+ // loop. See the cost model for when this can happen. If the step evenly
+ // divides the trip count, we set the remainder to be equal to the step. If
+ // the step does not evenly divide the trip count, no adjustment is necessary
+ // since there will already be scalar iterations. Note that the minimum
+ // iterations check ensures that N >= Step.
+ if (RequiresScalarEpilogue) {
+ assert(!TailByMasking &&
+           "requiring scalar epilogue is not supported with tail folding");
+ VPValue *IsZero = Builder.createICmp(
+ CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
+ R = Builder.createSelect(IsZero, Step, R);
+ }
+
+ VPValue *Res = Builder.createNaryOp(
+ Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
+ VectorTC.replaceAllUsesWith(Res);
+}
+
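
The VPInstruction sequence built above is ordinary integer arithmetic. As a
sketch, for a trip count N and Step = VF * UF it computes:

    #include <cstdint>

    // The scalar arithmetic materializeVectorTripCount emits; TailByMasking
    // and RequiresScalarEpilogue are never both set.
    uint64_t vectorTripCount(uint64_t N, uint64_t Step, bool TailByMasking,
                             bool RequiresScalarEpilogue) {
      if (TailByMasking)
        N += Step - 1;        // n.rnd.up: round N up to a multiple of Step
      uint64_t R = N % Step;  // n.mod.vf: iterations left for the scalar loop
      if (RequiresScalarEpilogue && R == 0)
        R = Step;             // guarantee at least one full scalar iteration
      return N - R;           // n.vec: iterations run by the vector loop
    }
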
/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index ecaca72..2afe956 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -177,10 +177,9 @@ struct VPlanTransforms {
/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe.
/// VPCanonicalIVPHIRecipe is only used to control the loop after
/// this transformation.
- /// \returns true if the transformation succeeds, or false if it doesn't.
- static bool
- tryAddExplicitVectorLength(VPlan &Plan,
- const std::optional<unsigned> &MaxEVLSafeElements);
+ static void
+ addExplicitVectorLength(VPlan &Plan,
+ const std::optional<unsigned> &MaxEVLSafeElements);
// For each Interleave Group in \p InterleaveGroups replace the Recipes
// widening its memory instructions with a single VPInterleaveRecipe at its
@@ -257,6 +256,12 @@ struct VPlanTransforms {
unsigned BestUF,
PredicatedScalarEvolution &PSE);
+ /// Materialize vector trip count computations to a set of VPInstructions.
+ static void materializeVectorTripCount(VPlan &Plan,
+ VPBasicBlock *VectorPHVPBB,
+ bool TailByMasking,
+ bool RequiresScalarEpilogue);
+
/// Materialize the backedge-taken count to be computed explicitly using
/// VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 14ae4f2..3417e1c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -157,7 +157,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
return VerifyEVLUse(*S, S->getNumOperands() - 1);
})
.Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe,
- VPWidenIntOrFpInductionRecipe>(
+ VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>(
[&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); })
.Case<VPScalarIVStepsRecipe>([&](auto *R) {
if (R->getNumOperands() != 3) {