aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Analysis/ConstantFolding.cpp5
-rw-r--r--llvm/lib/Analysis/InstructionSimplify.cpp2
-rw-r--r--llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp4
-rw-r--r--llvm/lib/Analysis/VectorUtils.cpp24
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp7
-rw-r--r--llvm/lib/CodeGen/BranchFolding.cpp48
-rw-r--r--llvm/lib/CodeGen/CommandFlags.cpp7
-rw-r--r--llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp24
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CallLowering.cpp6
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp6
-rw-r--r--llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp4
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp20
-rw-r--r--llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp19
-rw-r--r--llvm/lib/CodeGen/MIRParser/MILexer.cpp1
-rw-r--r--llvm/lib/CodeGen/MIRParser/MILexer.h1
-rw-r--r--llvm/lib/CodeGen/MIRParser/MIParser.cpp7
-rw-r--r--llvm/lib/CodeGen/MIRParser/MIRParser.cpp14
-rw-r--r--llvm/lib/CodeGen/MIRPrinter.cpp12
-rw-r--r--llvm/lib/CodeGen/MachineFunction.cpp5
-rw-r--r--llvm/lib/CodeGen/MachineOperand.cpp4
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp49
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp12
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp10
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp2
-rw-r--r--llvm/lib/CodeGen/TargetLoweringBase.cpp34
-rw-r--r--llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp3
-rw-r--r--llvm/lib/CodeGen/WindowsSecureHotPatching.cpp13
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h3
-rw-r--r--llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp26
-rw-r--r--llvm/lib/IR/IRBuilder.cpp29
-rw-r--r--llvm/lib/IR/Intrinsics.cpp24
-rw-r--r--llvm/lib/IR/Metadata.cpp2
-rw-r--r--llvm/lib/MC/MCMachOStreamer.cpp6
-rw-r--r--llvm/lib/MC/MCObjectStreamer.cpp148
-rw-r--r--llvm/lib/MC/MCWin64EH.cpp3
-rw-r--r--llvm/lib/MC/MCWinCOFFStreamer.cpp5
-rw-r--r--llvm/lib/MC/MCXCOFFStreamer.cpp12
-rw-r--r--llvm/lib/MC/MachObjectWriter.cpp2
-rw-r--r--llvm/lib/Object/IRSymtab.cpp11
-rw-r--r--llvm/lib/Object/SFrameParser.cpp5
-rw-r--r--llvm/lib/Passes/PassRegistry.def1
-rw-r--r--llvm/lib/Support/Debug.cpp69
-rw-r--r--llvm/lib/Support/Unix/Path.inc2
-rw-r--r--llvm/lib/Support/Windows/Threading.inc60
-rw-r--r--llvm/lib/TableGen/Record.cpp26
-rw-r--r--llvm/lib/TableGen/TGLexer.cpp2
-rw-r--r--llvm/lib/TableGen/TGLexer.h2
-rw-r--r--llvm/lib/TableGen/TGParser.cpp53
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp22
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h2
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGISel.td3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp18
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp93
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td126
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.cpp57
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h7
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp123
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h1
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp29
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp26
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h6
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp37
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp61
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.h5
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp9
-rw-r--r--llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp3
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp11
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp7
-rw-r--r--llvm/lib/Target/BPF/BTFDebug.cpp74
-rw-r--r--llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonMask.cpp2
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp37
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp6
-rw-r--r--llvm/lib/Target/Mips/MipsAsmPrinter.cpp8
-rw-r--r--llvm/lib/Target/Mips/MipsISelLowering.cpp29
-rw-r--r--llvm/lib/Target/Mips/MipsISelLowering.h1
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp273
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp629
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.h10
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td133
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp9
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h3
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp8
-rw-r--r--llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp38
-rw-r--r--llvm/lib/Target/RISCV/RISCVFrameLowering.cpp3
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp79
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.h1
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoP.td3
-rw-r--r--llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp25
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h4
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp45
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp28
-rw-r--r--llvm/lib/TargetParser/RISCVISAInfo.cpp19
-rw-r--r--llvm/lib/TargetParser/Triple.cpp6
-rw-r--r--llvm/lib/Transforms/HipStdPar/HipStdPar.cpp118
-rw-r--r--llvm/lib/Transforms/IPO/FunctionAttrs.cpp34
-rw-r--r--llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp148
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp8
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp9
-rw-r--r--llvm/lib/Transforms/Scalar/LoopFuse.cpp25
-rw-r--r--llvm/lib/Transforms/Scalar/NewGVN.cpp20
-rw-r--r--llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/SCCPSolver.cpp143
-rw-r--r--llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp93
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h8
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp12
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp12
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp20
126 files changed, 2066 insertions, 1624 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 759c553..7341dad 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1373,7 +1373,7 @@ Constant *llvm::FlushFPConstant(Constant *Operand, const Instruction *Inst,
if (ConstantFP *CFP = dyn_cast<ConstantFP>(Operand))
return flushDenormalConstantFP(CFP, Inst, IsOutput);
- if (isa<ConstantAggregateZero, UndefValue, ConstantExpr>(Operand))
+ if (isa<ConstantAggregateZero, UndefValue>(Operand))
return Operand;
Type *Ty = Operand->getType();
@@ -1389,6 +1389,9 @@ Constant *llvm::FlushFPConstant(Constant *Operand, const Instruction *Inst,
Ty = VecTy->getElementType();
}
+ if (isa<ConstantExpr>(Operand))
+ return Operand;
+
if (const auto *CV = dyn_cast<ConstantVector>(Operand)) {
SmallVector<Constant *, 16> NewElts;
for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 82530e7..5907e21 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -5366,7 +5366,7 @@ static Value *simplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty,
Type *MidTy = CI->getType();
Type *DstTy = Ty;
if (Src->getType() == Ty) {
- auto FirstOp = static_cast<Instruction::CastOps>(CI->getOpcode());
+ auto FirstOp = CI->getOpcode();
auto SecondOp = static_cast<Instruction::CastOps>(CastOpc);
Type *SrcIntPtrTy =
SrcTy->isPtrOrPtrVectorTy() ? Q.DL.getIntPtrType(SrcTy) : nullptr;
diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
index c871070..7025b83 100644
--- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -525,6 +525,8 @@ AAMDNodes AAMDNodes::merge(const AAMDNodes &Other) const {
Result.TBAAStruct = nullptr;
Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope);
Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias);
+ Result.NoAliasAddrSpace = MDNode::getMostGenericNoaliasAddrspace(
+ NoAliasAddrSpace, Other.NoAliasAddrSpace);
return Result;
}
@@ -533,6 +535,8 @@ AAMDNodes AAMDNodes::concat(const AAMDNodes &Other) const {
Result.TBAA = Result.TBAAStruct = nullptr;
Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope);
Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias);
+ Result.NoAliasAddrSpace = MDNode::getMostGenericNoaliasAddrspace(
+ NoAliasAddrSpace, Other.NoAliasAddrSpace);
return Result;
}
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 1b3da59..150ddce 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -240,30 +240,6 @@ Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
return Intrinsic::not_intrinsic;
}
-struct InterleaveIntrinsic {
- Intrinsic::ID Interleave, Deinterleave;
-};
-
-static InterleaveIntrinsic InterleaveIntrinsics[] = {
- {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2},
- {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3},
- {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4},
- {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5},
- {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6},
- {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7},
- {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8},
-};
-
-Intrinsic::ID llvm::getInterleaveIntrinsicID(unsigned Factor) {
- assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
- return InterleaveIntrinsics[Factor - 2].Interleave;
-}
-
-Intrinsic::ID llvm::getDeinterleaveIntrinsicID(unsigned Factor) {
- assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
- return InterleaveIntrinsics[Factor - 2].Deinterleave;
-}
-
unsigned llvm::getInterleaveIntrinsicFactor(Intrinsic::ID ID) {
switch (ID) {
case Intrinsic::vector_interleave2:
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 11b8576..7188833 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -972,10 +972,9 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
// the call graph which could lead to some target function. For tail
// calls, no return PC information is needed, unless tuning for GDB in
// DWARF4 mode in which case we fake a return PC for compatibility.
- const MCSymbol *PCAddr =
- (!IsTail || CU.useGNUAnalogForDwarf5Feature())
- ? const_cast<MCSymbol *>(getLabelAfterInsn(TopLevelCallMI))
- : nullptr;
+ const MCSymbol *PCAddr = (!IsTail || CU.useGNUAnalogForDwarf5Feature())
+ ? getLabelAfterInsn(TopLevelCallMI)
+ : nullptr;
// For tail calls, it's necessary to record the address of the branch
// instruction so that the debugger can show where the tail call occurred.
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index 3b3e7a4..dcfd9aa 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -2083,22 +2083,55 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
if (TBB == FBB) {
MBB->splice(Loc, TBB, TBB->begin(), TIB);
} else {
+ // Merge the debug locations, and hoist and kill the debug instructions from
+ // both branches. FIXME: We could probably try harder to preserve some debug
+ // instructions (but at least this isn't producing wrong locations).
+ MachineInstrBuilder MIRBuilder(*MBB->getParent(), Loc);
+ auto HoistAndKillDbgInstr = [MBB, Loc](MachineBasicBlock::iterator DI) {
+ assert(DI->isDebugInstr() && "Expected a debug instruction");
+ if (DI->isDebugRef()) {
+ const TargetInstrInfo *TII =
+ MBB->getParent()->getSubtarget().getInstrInfo();
+ const MCInstrDesc &DBGV = TII->get(TargetOpcode::DBG_VALUE);
+ DI = BuildMI(*MBB->getParent(), DI->getDebugLoc(), DBGV, false, 0,
+ DI->getDebugVariable(), DI->getDebugExpression());
+ MBB->insert(Loc, &*DI);
+ return;
+ }
+ // Deleting a DBG_PHI results in an undef at the referenced DBG_INSTR_REF.
+ if (DI->isDebugPHI()) {
+ DI->eraseFromParent();
+ return;
+ }
+ // Move DBG_LABELs without modifying them. Set DBG_VALUEs undef.
+ if (!DI->isDebugLabel())
+ DI->setDebugValueUndef();
+ DI->moveBefore(&*Loc);
+ };
+
// TIB and FIB point to the end of the regions to hoist/merge in TBB and
// FBB.
MachineBasicBlock::iterator FE = FIB;
MachineBasicBlock::iterator FI = FBB->begin();
for (MachineBasicBlock::iterator TI :
make_early_inc_range(make_range(TBB->begin(), TIB))) {
- // Move debug instructions and pseudo probes without modifying them.
- // FIXME: This is the wrong thing to do for debug locations, which
- // should at least be killed (and hoisted from BOTH blocks).
- if (TI->isDebugOrPseudoInstr()) {
- TI->moveBefore(&*Loc);
+ // Hoist and kill debug instructions from FBB. After this loop FI points
+ // to the next non-debug instruction to hoist (checked in assert after the
+ // TBB debug instruction handling code).
+ while (FI != FE && FI->isDebugInstr())
+ HoistAndKillDbgInstr(FI++);
+
+ // Kill debug instructions before moving.
+ if (TI->isDebugInstr()) {
+ HoistAndKillDbgInstr(TI);
continue;
}
- // Get the next non-meta instruction in FBB.
- FI = skipDebugInstructionsForward(FI, FE, false);
+ // FI and TI now point to identical non-debug instructions.
+ assert(FI != FE && "Unexpected end of FBB range");
+ // Pseudo probes are excluded from the range when identifying foldable
+ // instructions, so we don't expect to see one now.
+ assert(!TI->isPseudoProbe() && "Unexpected pseudo probe in range");
// NOTE: The loop above checks CheckKillDead but we can't do that here as
// it modifies some kill markers after the check.
assert(TI->isIdenticalTo(*FI, MachineInstr::CheckDefs) &&
@@ -2111,6 +2144,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
++FI;
}
}
+
FBB->erase(FBB->begin(), FIB);
if (UpdateLiveIns)
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp
index 9512f79..810dc29 100644
--- a/llvm/lib/CodeGen/CommandFlags.cpp
+++ b/llvm/lib/CodeGen/CommandFlags.cpp
@@ -101,6 +101,7 @@ CGOPT(EABI, EABIVersion)
CGOPT(DebuggerKind, DebuggerTuningOpt)
CGOPT(bool, EnableStackSizeSection)
CGOPT(bool, EnableAddrsig)
+CGOPT(bool, EnableCallGraphSection)
CGOPT(bool, EmitCallSiteInfo)
CGOPT(bool, EnableMachineFunctionSplitter)
CGOPT(bool, EnableStaticDataPartitioning)
@@ -461,6 +462,11 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
cl::init(false));
CGBINDOPT(EnableAddrsig);
+ static cl::opt<bool> EnableCallGraphSection(
+ "call-graph-section", cl::desc("Emit a call graph section"),
+ cl::init(false));
+ CGBINDOPT(EnableCallGraphSection);
+
static cl::opt<bool> EmitCallSiteInfo(
"emit-call-site-info",
cl::desc(
@@ -595,6 +601,7 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) {
Options.EnableMachineFunctionSplitter = getEnableMachineFunctionSplitter();
Options.EnableStaticDataPartitioning = getEnableStaticDataPartitioning();
Options.EmitAddrsig = getEnableAddrsig();
+ Options.EmitCallGraphSection = getEnableCallGraphSection();
Options.EmitCallSiteInfo = getEmitCallSiteInfo();
Options.EnableDebugEntryValues = getEnableDebugEntryValues();
Options.ForceDwarfFrameSection = getForceDwarfFrameSection();
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 8855740f..9b2851e 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -2186,19 +2186,16 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
llvm_unreachable("Deinterleave node should already have ReplacementNode");
break;
case ComplexDeinterleavingOperation::Splat: {
- auto *NewTy = VectorType::getDoubleElementsVectorType(
- cast<VectorType>(Node->Real->getType()));
auto *R = dyn_cast<Instruction>(Node->Real);
auto *I = dyn_cast<Instruction>(Node->Imag);
if (R && I) {
// Splats that are not constant are interleaved where they are located
Instruction *InsertPoint = (I->comesBefore(R) ? R : I)->getNextNode();
IRBuilder<> IRB(InsertPoint);
- ReplacementNode = IRB.CreateIntrinsic(Intrinsic::vector_interleave2,
- NewTy, {Node->Real, Node->Imag});
+ ReplacementNode = IRB.CreateVectorInterleave({Node->Real, Node->Imag});
} else {
- ReplacementNode = Builder.CreateIntrinsic(
- Intrinsic::vector_interleave2, NewTy, {Node->Real, Node->Imag});
+ ReplacementNode =
+ Builder.CreateVectorInterleave({Node->Real, Node->Imag});
}
break;
}
@@ -2226,10 +2223,7 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
auto *MaskImag = cast<Instruction>(Node->Imag)->getOperand(0);
auto *A = replaceNode(Builder, Node->Operands[0]);
auto *B = replaceNode(Builder, Node->Operands[1]);
- auto *NewMaskTy = VectorType::getDoubleElementsVectorType(
- cast<VectorType>(MaskReal->getType()));
- auto *NewMask = Builder.CreateIntrinsic(Intrinsic::vector_interleave2,
- NewMaskTy, {MaskReal, MaskImag});
+ auto *NewMask = Builder.CreateVectorInterleave({MaskReal, MaskImag});
ReplacementNode = Builder.CreateSelect(NewMask, A, B);
break;
}
@@ -2260,8 +2254,8 @@ void ComplexDeinterleavingGraph::processReductionSingle(
}
if (!NewInit)
- NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy,
- {Init, Constant::getNullValue(VTy)});
+ NewInit =
+ Builder.CreateVectorInterleave({Init, Constant::getNullValue(VTy)});
NewPHI->addIncoming(NewInit, Incoming);
NewPHI->addIncoming(OperationReplacement, BackEdge);
@@ -2281,16 +2275,12 @@ void ComplexDeinterleavingGraph::processReductionOperation(
auto *OldPHIImag = ReductionInfo[Imag].first;
auto *NewPHI = OldToNewPHI[OldPHIReal];
- auto *VTy = cast<VectorType>(Real->getType());
- auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
-
// We have to interleave initial origin values coming from IncomingBlock
Value *InitReal = OldPHIReal->getIncomingValueForBlock(Incoming);
Value *InitImag = OldPHIImag->getIncomingValueForBlock(Incoming);
IRBuilder<> Builder(Incoming->getTerminator());
- auto *NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy,
- {InitReal, InitImag});
+ auto *NewInit = Builder.CreateVectorInterleave({InitReal, InitImag});
NewPHI->addIncoming(NewInit, Incoming);
NewPHI->addIncoming(OperationReplacement, BackEdge);
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 012d873..9ba1782 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -1009,7 +1009,8 @@ void CallLowering::insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy,
for (unsigned I = 0; I < NumValues; ++I) {
Register Addr;
- MIRBuilder.materializePtrAdd(Addr, DemoteReg, OffsetLLTy, Offsets[I]);
+ MIRBuilder.materializeObjectPtrOffset(Addr, DemoteReg, OffsetLLTy,
+ Offsets[I]);
auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
MRI.getType(VRegs[I]),
commonAlignment(BaseAlign, Offsets[I]));
@@ -1039,7 +1040,8 @@ void CallLowering::insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy,
for (unsigned I = 0; I < NumValues; ++I) {
Register Addr;
- MIRBuilder.materializePtrAdd(Addr, DemoteReg, OffsetLLTy, Offsets[I]);
+ MIRBuilder.materializeObjectPtrOffset(Addr, DemoteReg, OffsetLLTy,
+ Offsets[I]);
auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
MRI.getType(VRegs[I]),
commonAlignment(BaseAlign, Offsets[I]));
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index e8f513a..e84ba91 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5949,8 +5949,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI,
const TargetOptions &Options = MF->getTarget().Options;
LLT DstType = MRI.getType(MI.getOperand(0).getReg());
- if (CanReassociate &&
- !(Options.UnsafeFPMath || MI.getFlag(MachineInstr::MIFlag::FmReassoc)))
+ if (CanReassociate && !MI.getFlag(MachineInstr::MIFlag::FmReassoc))
return false;
// Floating-point multiply-add with intermediate rounding.
@@ -5962,8 +5961,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI,
if (!HasFMAD && !HasFMA)
return false;
- AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath || HasFMAD;
+ AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD;
// If the addition is not contractable, do not combine.
if (!AllowFusionGlobally && !MI.getFlag(MachineInstr::MIFlag::FmContract))
return false;
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index dc5dfab..fd38c30 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1409,7 +1409,7 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
Regs.size() == 1 ? LI.getMetadata(LLVMContext::MD_range) : nullptr;
for (unsigned i = 0; i < Regs.size(); ++i) {
Register Addr;
- MIRBuilder.materializePtrAdd(Addr, Base, OffsetTy, Offsets[i] / 8);
+ MIRBuilder.materializeObjectPtrOffset(Addr, Base, OffsetTy, Offsets[i] / 8);
MachinePointerInfo Ptr(LI.getPointerOperand(), Offsets[i] / 8);
Align BaseAlign = getMemOpAlign(LI);
@@ -1448,7 +1448,7 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
for (unsigned i = 0; i < Vals.size(); ++i) {
Register Addr;
- MIRBuilder.materializePtrAdd(Addr, Base, OffsetTy, Offsets[i] / 8);
+ MIRBuilder.materializeObjectPtrOffset(Addr, Base, OffsetTy, Offsets[i] / 8);
MachinePointerInfo Ptr(SI.getPointerOperand(), Offsets[i] / 8);
Align BaseAlign = getMemOpAlign(SI);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index ed7b07f..d9d3569 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4170,7 +4170,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
LargeSplitSize / 8);
Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
- auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
+ auto SmallPtr = MIRBuilder.buildObjectPtrOffset(PtrAddReg, PtrReg, OffsetCst);
auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
SmallPtr, *SmallMMO);
@@ -4277,8 +4277,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
LLT PtrTy = MRI.getType(PtrReg);
auto OffsetCst = MIRBuilder.buildConstant(
LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
- auto SmallPtr =
- MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
+ auto SmallPtr = MIRBuilder.buildObjectPtrOffset(PtrTy, PtrReg, OffsetCst);
MachineMemOperand *LargeMMO =
MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
@@ -5349,7 +5348,8 @@ LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
unsigned ByteOffset = Offset / 8;
Register NewAddrReg;
- MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
+ MIRBuilder.materializeObjectPtrOffset(NewAddrReg, AddrReg, OffsetTy,
+ ByteOffset);
MachineMemOperand *NewMMO =
MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
@@ -8004,7 +8004,7 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
return UnableToLegalize;
- if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
+ if (MI.getFlag(MachineInstr::FmAfn)) {
unsigned Flags = MI.getFlags();
auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags);
MIRBuilder.buildFPTrunc(Dst, Src32, Flags);
@@ -9822,7 +9822,7 @@ LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
if (DstOff != 0) {
auto Offset =
MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
- Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
+ Ptr = MIB.buildObjectPtrOffset(PtrTy, Dst, Offset).getReg(0);
}
MIB.buildStore(Value, Ptr, *StoreMMO);
@@ -9962,7 +9962,7 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
LLT SrcTy = MRI.getType(Src);
Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
.getReg(0);
- LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
+ LoadPtr = MIB.buildObjectPtrOffset(SrcTy, Src, Offset).getReg(0);
}
auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
@@ -9970,7 +9970,7 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
Register StorePtr = Dst;
if (CurrOffset != 0) {
LLT DstTy = MRI.getType(Dst);
- StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
+ StorePtr = MIB.buildObjectPtrOffset(DstTy, Dst, Offset).getReg(0);
}
MIB.buildStore(LdVal, StorePtr, *StoreMMO);
CurrOffset += CopyTy.getSizeInBytes();
@@ -10060,7 +10060,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
LLT SrcTy = MRI.getType(Src);
auto Offset =
MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
- LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
+ LoadPtr = MIB.buildObjectPtrOffset(SrcTy, Src, Offset).getReg(0);
}
LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
CurrOffset += CopyTy.getSizeInBytes();
@@ -10078,7 +10078,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
LLT DstTy = MRI.getType(Dst);
auto Offset =
MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
- StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
+ StorePtr = MIB.buildObjectPtrOffset(DstTy, Dst, Offset).getReg(0);
}
MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
CurrOffset += CopyTy.getSizeInBytes();
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 121d7e8..9aa8deb 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -208,11 +208,18 @@ MachineIRBuilder::buildPtrAdd(const DstOp &Res, const SrcOp &Op0,
return buildInstr(TargetOpcode::G_PTR_ADD, {Res}, {Op0, Op1}, Flags);
}
+MachineInstrBuilder MachineIRBuilder::buildObjectPtrOffset(const DstOp &Res,
+ const SrcOp &Op0,
+ const SrcOp &Op1) {
+ return buildPtrAdd(Res, Op0, Op1, MachineInstr::MIFlag::NoUWrap);
+}
+
std::optional<MachineInstrBuilder>
MachineIRBuilder::materializePtrAdd(Register &Res, Register Op0,
- const LLT ValueTy, uint64_t Value) {
+ const LLT ValueTy, uint64_t Value,
+ std::optional<unsigned> Flags) {
assert(Res == 0 && "Res is a result argument");
- assert(ValueTy.isScalar() && "invalid offset type");
+ assert(ValueTy.isScalar() && "invalid offset type");
if (Value == 0) {
Res = Op0;
@@ -221,7 +228,13 @@ MachineIRBuilder::materializePtrAdd(Register &Res, Register Op0,
Res = getMRI()->createGenericVirtualRegister(getMRI()->getType(Op0));
auto Cst = buildConstant(ValueTy, Value);
- return buildPtrAdd(Res, Op0, Cst.getReg(0));
+ return buildPtrAdd(Res, Op0, Cst.getReg(0), Flags);
+}
+
+std::optional<MachineInstrBuilder> MachineIRBuilder::materializeObjectPtrOffset(
+ Register &Res, Register Op0, const LLT ValueTy, uint64_t Value) {
+ return materializePtrAdd(Res, Op0, ValueTy, Value,
+ MachineInstr::MIFlag::NoUWrap);
}
MachineInstrBuilder MachineIRBuilder::buildMaskLowPtrBits(const DstOp &Res,
diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp
index 7153902..193df1f 100644
--- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp
@@ -616,6 +616,7 @@ static MIToken::TokenKind getMetadataKeywordKind(StringRef Identifier) {
.Case("!range", MIToken::md_range)
.Case("!DIExpression", MIToken::md_diexpr)
.Case("!DILocation", MIToken::md_dilocation)
+ .Case("!noalias.addrspace", MIToken::md_noalias_addrspace)
.Default(MIToken::Error);
}
diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h
index d7cd067..54142ac 100644
--- a/llvm/lib/CodeGen/MIRParser/MILexer.h
+++ b/llvm/lib/CodeGen/MIRParser/MILexer.h
@@ -151,6 +151,7 @@ struct MIToken {
md_tbaa,
md_alias_scope,
md_noalias,
+ md_noalias_addrspace,
md_range,
md_diexpr,
md_dilocation,
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 3a364d5..807d59c 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -3482,6 +3482,11 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
if (parseMDNode(AAInfo.NoAlias))
return true;
break;
+ case MIToken::md_noalias_addrspace:
+ lex();
+ if (parseMDNode(AAInfo.NoAliasAddrSpace))
+ return true;
+ break;
case MIToken::md_range:
lex();
if (parseMDNode(Range))
@@ -3490,7 +3495,7 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
// TODO: Report an error on duplicate metadata nodes.
default:
return error("expected 'align' or '!tbaa' or '!alias.scope' or "
- "'!noalias' or '!range'");
+ "'!noalias' or '!range' or '!noalias.addrspace'");
}
}
if (expectAndConsume(MIToken::rparen))
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index 1e9fcf3..3e99e57 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -504,13 +504,21 @@ bool MIRParserImpl::initializeCallSiteInfo(
return error(Error, ArgRegPair.Reg.SourceRange);
CSInfo.ArgRegPairs.emplace_back(Reg, ArgRegPair.ArgNo);
}
+ if (!YamlCSInfo.CalleeTypeIds.empty()) {
+ for (auto CalleeTypeId : YamlCSInfo.CalleeTypeIds) {
+ IntegerType *Int64Ty = Type::getInt64Ty(Context);
+ CSInfo.CalleeTypeIds.push_back(ConstantInt::get(Int64Ty, CalleeTypeId,
+ /*isSigned=*/false));
+ }
+ }
- if (TM.Options.EmitCallSiteInfo)
+ if (TM.Options.EmitCallSiteInfo || TM.Options.EmitCallGraphSection)
MF.addCallSiteInfo(&*CallI, std::move(CSInfo));
}
- if (YamlMF.CallSitesInfo.size() && !TM.Options.EmitCallSiteInfo)
- return error(Twine("Call site info provided but not used"));
+ if (!YamlMF.CallSitesInfo.empty() &&
+ !(TM.Options.EmitCallSiteInfo || TM.Options.EmitCallGraphSection))
+ return error("call site info provided but not used");
return false;
}
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index bc4e299..ad7835a 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -525,24 +525,30 @@ static void convertCallSiteObjects(yaml::MachineFunction &YMF,
const MachineFunction &MF,
ModuleSlotTracker &MST) {
const auto *TRI = MF.getSubtarget().getRegisterInfo();
- for (auto CSInfo : MF.getCallSitesInfo()) {
+ for (auto [MI, CallSiteInfo] : MF.getCallSitesInfo()) {
yaml::CallSiteInfo YmlCS;
yaml::MachineInstrLoc CallLocation;
// Prepare instruction position.
- MachineBasicBlock::const_instr_iterator CallI = CSInfo.first->getIterator();
+ MachineBasicBlock::const_instr_iterator CallI = MI->getIterator();
CallLocation.BlockNum = CallI->getParent()->getNumber();
// Get call instruction offset from the beginning of block.
CallLocation.Offset =
std::distance(CallI->getParent()->instr_begin(), CallI);
YmlCS.CallLocation = CallLocation;
+
+ auto [ArgRegPairs, CalleeTypeIds] = CallSiteInfo;
// Construct call arguments and theirs forwarding register info.
- for (auto ArgReg : CSInfo.second.ArgRegPairs) {
+ for (auto ArgReg : ArgRegPairs) {
yaml::CallSiteInfo::ArgRegPair YmlArgReg;
YmlArgReg.ArgNo = ArgReg.ArgNo;
printRegMIR(ArgReg.Reg, YmlArgReg.Reg, TRI);
YmlCS.ArgForwardingRegs.emplace_back(YmlArgReg);
}
+ // Get type ids.
+ for (auto *CalleeTypeId : CalleeTypeIds) {
+ YmlCS.CalleeTypeIds.push_back(CalleeTypeId->getZExtValue());
+ }
YMF.CallSitesInfo.push_back(std::move(YmlCS));
}
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 429a17a..60d42e0 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -211,8 +211,7 @@ void MachineFunction::init() {
ConstantPool = new (Allocator) MachineConstantPool(getDataLayout());
Alignment = STI->getTargetLowering()->getMinFunctionAlignment();
- // FIXME: Use Function::hasOptSize().
- if (!F.getAlign() && !F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (!F.getAlign() && !F.hasOptSize())
Alignment = std::max(Alignment,
STI->getTargetLowering()->getPrefFunctionAlignment());
@@ -920,7 +919,7 @@ MachineFunction::getCallSiteInfo(const MachineInstr *MI) {
assert(MI->isCandidateForAdditionalCallInfo() &&
"Call site info refers only to call (MI) candidates");
- if (!Target.Options.EmitCallSiteInfo)
+ if (!Target.Options.EmitCallSiteInfo && !Target.Options.EmitCallGraphSection)
return CallSitesInfo.end();
return CallSitesInfo.find(MI);
}
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index 0d25169..c612f8de 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -1273,6 +1273,10 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
OS << ", !noalias ";
AAInfo.NoAlias->printAsOperand(OS, MST);
}
+ if (AAInfo.NoAliasAddrSpace) {
+ OS << ", !noalias.addrspace ";
+ AAInfo.NoAliasAddrSpace->printAsOperand(OS, MST);
+ }
if (getRanges()) {
OS << ", !range ";
getRanges()->printAsOperand(OS, MST);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d3df434..20b96f5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -35,6 +35,7 @@
#include "llvm/CodeGen/ByteProvider.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SDPatternMatch.h"
@@ -15262,23 +15263,31 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) {
}
}
- // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller
- // than X, and the And doesn't change the lower iX bits, we can move the
- // AssertZext in front of the And and drop the AssertSext.
if (Opcode == ISD::AssertZext && N0.getOpcode() == ISD::AND &&
- N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::AssertSext &&
isa<ConstantSDNode>(N0.getOperand(1))) {
- SDValue BigA = N0.getOperand(0);
- EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
const APInt &Mask = N0.getConstantOperandAPInt(1);
- if (AssertVT.bitsLT(BigA_AssertVT) &&
- Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) {
- SDLoc DL(N);
- SDValue NewAssert =
- DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1);
- return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert,
- N0.getOperand(1));
+
+ // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller
+ // than X, and the And doesn't change the lower iX bits, we can move the
+ // AssertZext in front of the And and drop the AssertSext.
+ if (N0.getOperand(0).getOpcode() == ISD::AssertSext && N0.hasOneUse()) {
+ SDValue BigA = N0.getOperand(0);
+ EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
+ if (AssertVT.bitsLT(BigA_AssertVT) &&
+ Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) {
+ SDLoc DL(N);
+ SDValue NewAssert =
+ DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1);
+ return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert,
+ N0.getOperand(1));
+ }
}
+
+ // Remove AssertZext entirely if the mask guarantees the assertion cannot
+ // fail.
+ // TODO: Use KB countMinLeadingZeros to handle non-constant masks?
+ if (Mask.isIntN(AssertVT.getScalarSizeInBits()))
+ return N0;
}
return SDValue();
@@ -22778,8 +22787,10 @@ SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG);
// If we store purely within object bounds just before its lifetime ends,
// we can remove the store.
- if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase,
- StoreSize.getFixedValue() * 8)) {
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ if (LifetimeEndBase.contains(
+ DAG, MFI.getObjectSize(LifetimeEnd->getFrameIndex()) * 8,
+ StoreBase, StoreSize.getFixedValue() * 8)) {
LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump();
dbgs() << "\nwithin LIFETIME_END of : ";
LifetimeEndBase.dump(); dbgs() << "\n");
@@ -29415,7 +29426,7 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
MachineMemOperand *MMO;
};
- auto getCharacteristics = [](SDNode *N) -> MemUseCharacteristics {
+ auto getCharacteristics = [this](SDNode *N) -> MemUseCharacteristics {
if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) {
int64_t Offset = 0;
if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset()))
@@ -29428,13 +29439,15 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
LSN->getBasePtr(), Offset /*base offset*/,
LocationSize::precise(Size), LSN->getMemOperand()};
}
- if (const auto *LN = cast<LifetimeSDNode>(N))
+ if (const auto *LN = cast<LifetimeSDNode>(N)) {
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
return {false /*isVolatile*/,
/*isAtomic*/ false,
LN->getOperand(1),
0,
- LocationSize::precise(LN->getSize()),
+ LocationSize::precise(MFI.getObjectSize(LN->getFrameIndex())),
(MachineMemOperand *)nullptr};
+ }
// Default.
return {false /*isvolatile*/,
/*isAtomic*/ false,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 74172b2..ba0ab23 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3853,7 +3853,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
break;
case ISD::FP_TO_FP16:
LLVM_DEBUG(dbgs() << "Legalizing FP_TO_FP16\n");
- if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) {
+ if (Node->getFlags().hasApproximateFuncs() && !TLI.useSoftFloat()) {
SDValue Op = Node->getOperand(0);
MVT SVT = Op.getSimpleValueType();
if ((SVT == MVT::f64 || SVT == MVT::f80) &&
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 773ff48..5c586f7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -784,10 +784,6 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
case ISD::TargetFrameIndex:
ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex());
break;
- case ISD::LIFETIME_START:
- case ISD::LIFETIME_END:
- ID.AddInteger(cast<LifetimeSDNode>(N)->getSize());
- break;
case ISD::PSEUDO_PROBE:
ID.AddInteger(cast<PseudoProbeSDNode>(N)->getGuid());
ID.AddInteger(cast<PseudoProbeSDNode>(N)->getIndex());
@@ -9360,8 +9356,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
}
SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl,
- SDValue Chain, int FrameIndex,
- int64_t Size) {
+ SDValue Chain, int FrameIndex) {
const unsigned Opcode = IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END;
const auto VTs = getVTList(MVT::Other);
SDValue Ops[2] = {
@@ -9373,13 +9368,12 @@ SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl,
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
ID.AddInteger(FrameIndex);
- ID.AddInteger(Size);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
return SDValue(E, 0);
- LifetimeSDNode *N = newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(),
- dl.getDebugLoc(), VTs, Size);
+ LifetimeSDNode *N =
+ newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 1636465..306e068 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3923,11 +3923,15 @@ void SelectionDAGBuilder::visitFPTrunc(const User &I) {
// FPTrunc is never a no-op cast, no need to check
SDValue N = getValue(I.getOperand(0));
SDLoc dl = getCurSDLoc();
+ SDNodeFlags Flags;
+ if (auto *TruncInst = dyn_cast<FPMathOperator>(&I))
+ Flags.copyFMF(*TruncInst);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
setValue(&I, DAG.getNode(ISD::FP_ROUND, dl, DestVT, N,
DAG.getTargetConstant(
- 0, dl, TLI.getPointerTy(DAG.getDataLayout()))));
+ 0, dl, TLI.getPointerTy(DAG.getDataLayout())),
+ Flags));
}
void SelectionDAGBuilder::visitFPExt(const User &I) {
@@ -7594,8 +7598,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
if (TM.getOptLevel() == CodeGenOptLevel::None)
return;
- const int64_t ObjectSize =
- cast<ConstantInt>(I.getArgOperand(0))->getSExtValue();
const AllocaInst *LifetimeObject = cast<AllocaInst>(I.getArgOperand(1));
// First check that the Alloca is static, otherwise it won't have a
@@ -7605,7 +7607,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
const int FrameIndex = SI->second;
- Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex, ObjectSize);
+ Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex);
DAG.setRoot(Res);
return;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 9474587..900da76 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -946,8 +946,6 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
<< " -> "
<< ASC->getDestAddressSpace()
<< ']';
- } else if (const LifetimeSDNode *LN = dyn_cast<LifetimeSDNode>(this)) {
- OS << "<0 to " << LN->getSize() << ">";
} else if (const auto *AA = dyn_cast<AssertAlignSDNode>(this)) {
OS << '<' << AA->getAlign().value() << '>';
}
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index d4a3455..68b8a00 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -806,7 +806,17 @@ void TargetLoweringBase::initActions() {
ISD::SDIVFIX, ISD::SDIVFIXSAT,
ISD::UDIVFIX, ISD::UDIVFIXSAT,
ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
- ISD::IS_FPCLASS},
+ ISD::IS_FPCLASS, ISD::FCBRT,
+ ISD::FLOG, ISD::FLOG2,
+ ISD::FLOG10, ISD::FEXP,
+ ISD::FEXP2, ISD::FEXP10,
+ ISD::FFLOOR, ISD::FNEARBYINT,
+ ISD::FCEIL, ISD::FRINT,
+ ISD::FTRUNC, ISD::FROUNDEVEN,
+ ISD::FTAN, ISD::FACOS,
+ ISD::FASIN, ISD::FATAN,
+ ISD::FCOSH, ISD::FSINH,
+ ISD::FTANH, ISD::FATAN2},
VT, Expand);
// Overflow operations default to expand
@@ -852,13 +862,12 @@ void TargetLoweringBase::initActions() {
// These operations default to expand for vector types.
if (VT.isVector())
- setOperationAction(
- {ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG, ISD::ANY_EXTEND_VECTOR_INREG,
- ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG,
- ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::LROUND,
- ISD::LLROUND, ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN,
- ISD::FCOSH, ISD::FSINH, ISD::FTANH, ISD::FATAN2},
- VT, Expand);
+ setOperationAction({ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG,
+ ISD::ANY_EXTEND_VECTOR_INREG,
+ ISD::SIGN_EXTEND_VECTOR_INREG,
+ ISD::ZERO_EXTEND_VECTOR_INREG, ISD::SPLAT_VECTOR,
+ ISD::LRINT, ISD::LLRINT, ISD::LROUND, ISD::LLROUND},
+ VT, Expand);
// Constrained floating-point operations default to expand.
#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
@@ -914,15 +923,6 @@ void TargetLoweringBase::initActions() {
{MVT::bf16, MVT::f16, MVT::f32, MVT::f64, MVT::f80, MVT::f128},
Expand);
- // These library functions default to expand.
- setOperationAction({ISD::FCBRT, ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
- ISD::FEXP, ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR,
- ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
- ISD::FROUNDEVEN, ISD::FTAN, ISD::FACOS, ISD::FASIN,
- ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH,
- ISD::FATAN2},
- {MVT::f32, MVT::f64, MVT::f128}, Expand);
-
// Insert custom handling default for llvm.canonicalize.*.
setOperationAction(ISD::FCANONICALIZE,
{MVT::f16, MVT::f32, MVT::f64, MVT::f128}, Expand);
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 408d07b..725e951 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -1734,7 +1734,8 @@ MCSection *TargetLoweringObjectFileCOFF::getExplicitSectionGlobal(
Name == getInstrProfSectionName(IPSK_covdata, Triple::COFF,
/*AddSegmentInfo=*/false) ||
Name == getInstrProfSectionName(IPSK_covname, Triple::COFF,
- /*AddSegmentInfo=*/false))
+ /*AddSegmentInfo=*/false) ||
+ Name == ".llvmbc" || Name == ".llvmcmd")
Kind = SectionKind::getMetadata();
int Selection = 0;
unsigned Characteristics = getCOFFSectionFlags(Kind, TM);
diff --git a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp
index 6267207..fd54190 100644
--- a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp
+++ b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp
@@ -369,6 +369,19 @@ static GlobalVariable *getOrCreateRefVariable(
AddrOfOldGV, Twine("__ref_").concat(GV->getName()),
nullptr, GlobalVariable::NotThreadLocal);
+ // RefGV is created with isConstant = false, but we want to place RefGV into
+ // .rdata, not .data. It is important that the GlobalVariable be mutable
+ // from the compiler's point of view, so that the optimizer does not remove
+ // the global variable entirely and replace all references to it with its
+ // initial value.
+ //
+ // When the Windows hot-patch loader applies a hot-patch, it maps the
+ // pages of .rdata as read/write so that it can set each __ref_* variable
+ // to point to the original variable in the base image. Afterward, pages in
+ // .rdata are remapped as read-only. This protects the __ref_* variables from
+ // being overwritten during execution.
+ RefGV->setSection(".rdata");
+
// Create debug info for the replacement global variable.
DataLayout Layout = M->getDataLayout();
DIType *DebugType = DebugInfo.createPointerType(
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
index bd0d72f..0e95369 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
@@ -157,8 +157,7 @@ private:
processSubtractRelocation(unsigned SectionID, relocation_iterator RelI,
const MachOObjectFile &BaseObj,
ObjSectionToIDMap &ObjSectionToID) {
- const MachOObjectFile &Obj =
- static_cast<const MachOObjectFile&>(BaseObj);
+ const MachOObjectFile &Obj = BaseObj;
MachO::any_relocation_info RE =
Obj.getRelocation(RelI->getRawDataRefImpl());
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 840ca83..7928772 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2617,7 +2617,7 @@ void OpenMPIRBuilder::emitReductionListCopy(
Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
AttributeList FuncAttrs) {
- IRBuilder<>::InsertPointGuard IPG(Builder);
+ InsertPointTy SavedIP = Builder.saveIP();
LLVMContext &Ctx = M.getContext();
FunctionType *FuncTy = FunctionType::get(
Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
@@ -2630,7 +2630,6 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
WcFunc->addParamAttr(1, Attribute::NoUndef);
BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
Builder.SetInsertPoint(EntryBB);
- Builder.SetCurrentDebugLocation(llvm::DebugLoc());
// ReduceList: thread local Reduce list.
// At the stage of the computation when this function is called, partially
@@ -2845,6 +2844,7 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
}
Builder.CreateRetVoid();
+ Builder.restoreIP(SavedIP);
return WcFunc;
}
@@ -2853,7 +2853,6 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
AttributeList FuncAttrs) {
LLVMContext &Ctx = M.getContext();
- IRBuilder<>::InsertPointGuard IPG(Builder);
FunctionType *FuncTy =
FunctionType::get(Builder.getVoidTy(),
{Builder.getPtrTy(), Builder.getInt16Ty(),
@@ -2872,7 +2871,6 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
SarFunc->addParamAttr(3, Attribute::SExt);
BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
Builder.SetInsertPoint(EntryBB);
- Builder.SetCurrentDebugLocation(llvm::DebugLoc());
// Thread local Reduce list used to host the values of data to be reduced.
Argument *ReduceListArg = SarFunc->getArg(0);
@@ -3019,7 +3017,7 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
AttributeList FuncAttrs) {
- IRBuilder<>::InsertPointGuard IPG(Builder);
+ OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
LLVMContext &Ctx = M.getContext();
FunctionType *FuncTy = FunctionType::get(
Builder.getVoidTy(),
@@ -3035,7 +3033,6 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
Builder.SetInsertPoint(EntryBlock);
- Builder.SetCurrentDebugLocation(llvm::DebugLoc());
// Buffer: global reduction buffer.
Argument *BufferArg = LtGCFunc->getArg(0);
@@ -3123,13 +3120,14 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
}
Builder.CreateRetVoid();
+ Builder.restoreIP(OldIP);
return LtGCFunc;
}
Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
Type *ReductionsBufferTy, AttributeList FuncAttrs) {
- IRBuilder<>::InsertPointGuard IPG(Builder);
+ OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
LLVMContext &Ctx = M.getContext();
FunctionType *FuncTy = FunctionType::get(
Builder.getVoidTy(),
@@ -3145,7 +3143,6 @@ Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
Builder.SetInsertPoint(EntryBlock);
- Builder.SetCurrentDebugLocation(llvm::DebugLoc());
// Buffer: global reduction buffer.
Argument *BufferArg = LtGRFunc->getArg(0);
@@ -3206,13 +3203,14 @@ Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
->addFnAttr(Attribute::NoUnwind);
Builder.CreateRetVoid();
+ Builder.restoreIP(OldIP);
return LtGRFunc;
}
Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
AttributeList FuncAttrs) {
- IRBuilder<>::InsertPointGuard IPG(Builder);
+ OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
LLVMContext &Ctx = M.getContext();
FunctionType *FuncTy = FunctionType::get(
Builder.getVoidTy(),
@@ -3228,7 +3226,6 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
Builder.SetInsertPoint(EntryBlock);
- Builder.SetCurrentDebugLocation(llvm::DebugLoc());
// Buffer: global reduction buffer.
Argument *BufferArg = LtGCFunc->getArg(0);
@@ -3314,13 +3311,14 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
}
Builder.CreateRetVoid();
+ Builder.restoreIP(OldIP);
return LtGCFunc;
}
Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
Type *ReductionsBufferTy, AttributeList FuncAttrs) {
- IRBuilder<>::InsertPointGuard IPG(Builder);
+ OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
LLVMContext &Ctx = M.getContext();
auto *FuncTy = FunctionType::get(
Builder.getVoidTy(),
@@ -3336,7 +3334,6 @@ Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
Builder.SetInsertPoint(EntryBlock);
- Builder.SetCurrentDebugLocation(llvm::DebugLoc());
// Buffer: global reduction buffer.
Argument *BufferArg = LtGRFunc->getArg(0);
@@ -3397,6 +3394,7 @@ Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
->addFnAttr(Attribute::NoUnwind);
Builder.CreateRetVoid();
+ Builder.restoreIP(OldIP);
return LtGRFunc;
}
@@ -3409,7 +3407,6 @@ std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
Expected<Function *> OpenMPIRBuilder::createReductionFunction(
StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
- IRBuilder<>::InsertPointGuard IPG(Builder);
auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
{Builder.getPtrTy(), Builder.getPtrTy()},
/* IsVarArg */ false);
@@ -3422,7 +3419,6 @@ Expected<Function *> OpenMPIRBuilder::createReductionFunction(
BasicBlock *EntryBB =
BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
Builder.SetInsertPoint(EntryBB);
- Builder.SetCurrentDebugLocation(llvm::DebugLoc());
// Need to alloca memory here and deal with the pointers before getting
// LHS/RHS pointers out
@@ -3750,12 +3746,10 @@ static Error populateReductionFunction(
Function *ReductionFunc,
ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
- IRBuilder<>::InsertPointGuard IPG(Builder);
Module *Module = ReductionFunc->getParent();
BasicBlock *ReductionFuncBlock =
BasicBlock::Create(Module->getContext(), "", ReductionFunc);
Builder.SetInsertPoint(ReductionFuncBlock);
- Builder.SetCurrentDebugLocation(llvm::DebugLoc());
Value *LHSArrayPtr = nullptr;
Value *RHSArrayPtr = nullptr;
if (IsGPU) {
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 28037d7..49c6dc7 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -1144,9 +1144,32 @@ Value *IRBuilderBase::CreateVectorSplat(ElementCount EC, Value *V,
return CreateShuffleVector(V, Zeros, Name + ".splat");
}
-Value *IRBuilderBase::CreatePreserveArrayAccessIndex(
- Type *ElTy, Value *Base, unsigned Dimension, unsigned LastIndex,
- MDNode *DbgInfo) {
+Value *IRBuilderBase::CreateVectorInterleave(ArrayRef<Value *> Ops,
+ const Twine &Name) {
+ assert(Ops.size() >= 2 && Ops.size() <= 8 &&
+ "Unexpected number of operands to interleave");
+
+ // Make sure all operands are the same type.
+ assert(isa<VectorType>(Ops[0]->getType()) && "Unexpected type");
+
+#ifndef NDEBUG
+ for (unsigned I = 1; I < Ops.size(); I++) {
+ assert(Ops[I]->getType() == Ops[0]->getType() &&
+ "Vector interleave expects matching operand types!");
+ }
+#endif
+
+ unsigned IID = Intrinsic::getInterleaveIntrinsicID(Ops.size());
+ auto *SubvecTy = cast<VectorType>(Ops[0]->getType());
+ Type *DestTy = VectorType::get(SubvecTy->getElementType(),
+ SubvecTy->getElementCount() * Ops.size());
+ return CreateIntrinsic(IID, {DestTy}, Ops, {}, Name);
+}
+
+Value *IRBuilderBase::CreatePreserveArrayAccessIndex(Type *ElTy, Value *Base,
+ unsigned Dimension,
+ unsigned LastIndex,
+ MDNode *DbgInfo) {
auto *BaseType = Base->getType();
assert(isa<PointerType>(BaseType) &&
"Invalid Base ptr type for preserve.array.access.index.");
diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp
index 6c35ade..58a1f74 100644
--- a/llvm/lib/IR/Intrinsics.cpp
+++ b/llvm/lib/IR/Intrinsics.cpp
@@ -1133,3 +1133,27 @@ std::optional<Function *> Intrinsic::remangleIntrinsicFunction(Function *F) {
"Shouldn't change the signature");
return NewDecl;
}
+
+struct InterleaveIntrinsic {
+ Intrinsic::ID Interleave, Deinterleave;
+};
+
+static InterleaveIntrinsic InterleaveIntrinsics[] = {
+ {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2},
+ {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3},
+ {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4},
+ {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5},
+ {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6},
+ {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7},
+ {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8},
+};
+
+Intrinsic::ID Intrinsic::getInterleaveIntrinsicID(unsigned Factor) {
+ assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+ return InterleaveIntrinsics[Factor - 2].Interleave;
+}
+
+Intrinsic::ID Intrinsic::getDeinterleaveIntrinsicID(unsigned Factor) {
+ assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+ return InterleaveIntrinsics[Factor - 2].Deinterleave;
+}
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index 0dbd07f..1157cbe 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -1796,6 +1796,7 @@ AAMDNodes Instruction::getAAMetadata() const {
Result.TBAAStruct = Info.lookup(LLVMContext::MD_tbaa_struct);
Result.Scope = Info.lookup(LLVMContext::MD_alias_scope);
Result.NoAlias = Info.lookup(LLVMContext::MD_noalias);
+ Result.NoAliasAddrSpace = Info.lookup(LLVMContext::MD_noalias_addrspace);
}
return Result;
}
@@ -1805,6 +1806,7 @@ void Instruction::setAAMetadata(const AAMDNodes &N) {
setMetadata(LLVMContext::MD_tbaa_struct, N.TBAAStruct);
setMetadata(LLVMContext::MD_alias_scope, N.Scope);
setMetadata(LLVMContext::MD_noalias, N.NoAlias);
+ setMetadata(LLVMContext::MD_noalias_addrspace, N.NoAliasAddrSpace);
}
void Instruction::setNoSanitizeMetadata() {
diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp
index 1074669..a214513 100644
--- a/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/llvm/lib/MC/MCMachOStreamer.cpp
@@ -484,7 +484,8 @@ void MCMachOStreamer::finalizeCGProfile() {
// For each entry, reserve space for 2 32-bit indices and a 64-bit count.
size_t SectionBytes =
W.getCGProfile().size() * (2 * sizeof(uint32_t) + sizeof(uint64_t));
- (*CGProfileSection->begin()).appendContents(SectionBytes, 0);
+ (*CGProfileSection->begin())
+ .setVarContents(std::vector<char>(SectionBytes, 0));
}
MCStreamer *llvm::createMachOStreamer(MCContext &Context,
@@ -520,5 +521,6 @@ void MCMachOStreamer::createAddrSigSection() {
// (instead of emitting a zero-sized section) so these relocations are
// technically valid, even though we don't expect these relocations to
// actually be applied by the linker.
- Frag->appendContents(8, 0);
+ constexpr char zero[8] = {};
+ Frag->setVarContents(zero);
}
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index f046552..e82393a 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -46,23 +46,83 @@ MCAssembler *MCObjectStreamer::getAssemblerPtr() {
return nullptr;
}
+constexpr size_t FragBlockSize = 16384;
+// Ensure the new fragment can at least store a few bytes.
+constexpr size_t NewFragHeadroom = 8;
+
+static_assert(NewFragHeadroom >= alignof(MCFragment));
+static_assert(FragBlockSize >= sizeof(MCFragment) + NewFragHeadroom);
+
+MCFragment *MCObjectStreamer::allocFragSpace(size_t Headroom) {
+ auto Size = std::max(FragBlockSize, sizeof(MCFragment) + Headroom);
+ FragSpace = Size - sizeof(MCFragment);
+ auto Chunk = std::unique_ptr<char[]>(new char[Size]);
+ auto *F = reinterpret_cast<MCFragment *>(Chunk.get());
+ FragStorage.push_back(std::move(Chunk));
+ return F;
+}
+
void MCObjectStreamer::newFragment() {
- addFragment(getContext().allocFragment<MCFragment>());
+ MCFragment *F;
+ if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) {
+ auto End = reinterpret_cast<size_t>(getCurFragEnd());
+ F = reinterpret_cast<MCFragment *>(
+ alignToPowerOf2(End, alignof(MCFragment)));
+ FragSpace -= size_t(F) - End + sizeof(MCFragment);
+ } else {
+ F = allocFragSpace(0);
+ }
+ new (F) MCFragment();
+ addFragment(F);
}
-void MCObjectStreamer::insert(MCFragment *F) {
- assert(F->getKind() != MCFragment::FT_Data &&
+void MCObjectStreamer::ensureHeadroom(size_t Headroom) {
+ if (Headroom <= FragSpace)
+ return;
+ auto *F = allocFragSpace(Headroom);
+ new (F) MCFragment();
+ addFragment(F);
+}
+
+void MCObjectStreamer::insert(MCFragment *Frag) {
+ assert(Frag->getKind() != MCFragment::FT_Data &&
"F should have a variable-size tail");
+ // Frag is not connected to FragSpace. Before modifying CurFrag with
+ // addFragment(Frag), allocate an empty fragment to maintain FragSpace
+ // connectivity, potentially reusing CurFrag's associated space.
+ MCFragment *F;
+ if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) {
+ auto End = reinterpret_cast<size_t>(getCurFragEnd());
+ F = reinterpret_cast<MCFragment *>(
+ alignToPowerOf2(End, alignof(MCFragment)));
+ FragSpace -= size_t(F) - End + sizeof(MCFragment);
+ } else {
+ F = allocFragSpace(0);
+ }
+ new (F) MCFragment();
+
+ addFragment(Frag);
addFragment(F);
- newFragment();
+}
+
+void MCObjectStreamer::appendContents(ArrayRef<char> Contents) {
+ ensureHeadroom(Contents.size());
+ assert(FragSpace >= Contents.size());
+ llvm::copy(Contents, getCurFragEnd());
+ CurFrag->FixedSize += Contents.size();
+ FragSpace -= Contents.size();
}
void MCObjectStreamer::appendContents(size_t Num, char Elt) {
- CurFrag->appendContents(Num, Elt);
+ ensureHeadroom(Num);
+ MutableArrayRef<char> Data(getCurFragEnd(), Num);
+ llvm::fill(Data, Elt);
+ CurFrag->FixedSize += Num;
+ FragSpace -= Num;
}
void MCObjectStreamer::addFixup(const MCExpr *Value, MCFixupKind Kind) {
- CurFrag->addFixup(MCFixup::create(CurFrag->getFixedSize(), Value, Kind));
+ CurFrag->addFixup(MCFixup::create(getCurFragSize(), Value, Kind));
}
// As a compile-time optimization, avoid allocating and evaluating an MCExpr
@@ -111,6 +171,8 @@ void MCObjectStreamer::reset() {
}
EmitEHFrame = true;
EmitDebugFrame = false;
+ FragStorage.clear();
+ FragSpace = 0;
MCStreamer::reset();
}
@@ -139,7 +201,6 @@ void MCObjectStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) {
void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
SMLoc Loc) {
MCStreamer::emitValueImpl(Value, Size, Loc);
- MCFragment *DF = getCurrentFragment();
MCDwarfLineEntry::make(this, getCurrentSectionOnly());
@@ -154,9 +215,9 @@ void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
emitIntValue(AbsValue, Size);
return;
}
- DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
- MCFixup::getDataKindForSize(Size)));
- DF->appendContents(Size, 0);
+ ensureHeadroom(Size);
+ addFixup(Value, MCFixup::getDataKindForSize(Size));
+ appendContents(Size, 0);
}
MCSymbol *MCObjectStreamer::emitCFILabel() {
@@ -190,7 +251,7 @@ void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
// section.
MCFragment *F = CurFrag;
Symbol->setFragment(F);
- Symbol->setOffset(F->getContents().size());
+ Symbol->setOffset(F->getFixedSize());
emitPendingAssignments(Symbol);
}
@@ -256,6 +317,21 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
F0 = CurFrag;
}
+ // To maintain connectivity between CurFrag and FragSpace when CurFrag is
+ // modified, allocate an empty fragment and append it to the fragment list.
+ // (Subsections[I].second.Tail is not connected to FragSpace.)
+ MCFragment *F;
+ if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) {
+ auto End = reinterpret_cast<size_t>(getCurFragEnd());
+ F = reinterpret_cast<MCFragment *>(
+ alignToPowerOf2(End, alignof(MCFragment)));
+ FragSpace -= size_t(F) - End + sizeof(MCFragment);
+ } else {
+ F = allocFragSpace(0);
+ }
+ new (F) MCFragment();
+ F->setParent(Section);
+
auto &Subsections = Section->Subsections;
size_t I = 0, E = Subsections.size();
while (I != E && Subsections[I].first < Subsection)
@@ -263,13 +339,16 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
// If the subsection number is not in the sorted Subsections list, create a
// new fragment list.
if (I == E || Subsections[I].first != Subsection) {
- auto *F = getContext().allocFragment<MCFragment>();
- F->setParent(Section);
Subsections.insert(Subsections.begin() + I,
{Subsection, MCSection::FragList{F, F}});
+ Section->CurFragList = &Subsections[I].second;
+ CurFrag = F;
+ } else {
+ Section->CurFragList = &Subsections[I].second;
+ CurFrag = Subsections[I].second.Tail;
+ // Ensure CurFrag is associated with FragSpace.
+ addFragment(F);
}
- Section->CurFragList = &Subsections[I].second;
- CurFrag = Section->CurFragList->Tail;
// Define the section symbol at subsection 0's initial fragment if required.
if (!NewSec)
@@ -340,31 +419,37 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst,
MCFragment *F = getCurrentFragment();
// Append the instruction to the data fragment.
- size_t FixupStartIndex = F->getFixups().size();
- size_t CodeOffset = F->getContents().size();
+ size_t CodeOffset = getCurFragSize();
+ SmallString<16> Content;
SmallVector<MCFixup, 1> Fixups;
- getAssembler().getEmitter().encodeInstruction(
- Inst, F->getContentsForAppending(), Fixups, STI);
- F->doneAppending();
- if (!Fixups.empty())
- F->appendFixups(Fixups);
+ getAssembler().getEmitter().encodeInstruction(Inst, Content, Fixups, STI);
+ appendContents(Content);
+ if (CurFrag != F) {
+ F = CurFrag;
+ CodeOffset = 0;
+ }
F->setHasInstructions(STI);
+ if (Fixups.empty())
+ return;
bool MarkedLinkerRelaxable = false;
- for (auto &Fixup : MutableArrayRef(F->getFixups()).slice(FixupStartIndex)) {
+ for (auto &Fixup : Fixups) {
Fixup.setOffset(Fixup.getOffset() + CodeOffset);
- if (!Fixup.isLinkerRelaxable())
+ if (!Fixup.isLinkerRelaxable() || MarkedLinkerRelaxable)
continue;
- F->setLinkerRelaxable();
+ MarkedLinkerRelaxable = true;
+ // Set the fragment's order within the subsection for use by
+ // MCAssembler::relaxAlign.
+ auto *Sec = F->getParent();
+ if (!Sec->isLinkerRelaxable())
+ Sec->setLinkerRelaxable();
// Do not add data after a linker-relaxable instruction. The difference
// between a new label and a label at or before the linker-relaxable
// instruction cannot be resolved at assemble-time.
- if (!MarkedLinkerRelaxable) {
- MarkedLinkerRelaxable = true;
- getCurrentSectionOnly()->setLinkerRelaxable();
- newFragment();
- }
+ F->setLinkerRelaxable();
+ newFragment();
}
+ F->appendFixups(Fixups);
}
void MCObjectStreamer::emitInstToFragment(const MCInst &Inst,
@@ -536,8 +621,7 @@ void MCObjectStreamer::emitCVFileChecksumOffsetDirective(unsigned FileNo) {
void MCObjectStreamer::emitBytes(StringRef Data) {
MCDwarfLineEntry::make(this, getCurrentSectionOnly());
- MCFragment *DF = getCurrentFragment();
- DF->appendContents(ArrayRef(Data.data(), Data.size()));
+ appendContents(ArrayRef(Data.data(), Data.size()));
}
void MCObjectStreamer::emitValueToAlignment(Align Alignment, int64_t Fill,
diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp
index 72a8dd7..a87648a 100644
--- a/llvm/lib/MC/MCWin64EH.cpp
+++ b/llvm/lib/MC/MCWin64EH.cpp
@@ -318,6 +318,9 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
// Emit the epilog instructions.
if (EnableUnwindV2) {
+ // Ensure the fixups and appended content apply to the same fragment.
+ OS->ensureHeadroom(info->EpilogMap.size() * 2);
+
bool IsLast = true;
for (const auto &Epilog : llvm::reverse(info->EpilogMap)) {
if (IsLast) {
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp
index 1ffe25c..8be5054 100644
--- a/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -280,6 +280,7 @@ void MCWinCOFFStreamer::emitCOFFSymbolIndex(MCSymbol const *Symbol) {
void MCWinCOFFStreamer::emitCOFFSectionIndex(const MCSymbol *Symbol) {
visitUsedSymbol(*Symbol);
const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext());
+ ensureHeadroom(2);
addFixup(SRE, FK_SecRel_2);
appendContents(2, 0);
}
@@ -293,6 +294,7 @@ void MCWinCOFFStreamer::emitCOFFSecRel32(const MCSymbol *Symbol,
if (Offset)
MCE = MCBinaryExpr::createAdd(
MCE, MCConstantExpr::create(Offset, getContext()), getContext());
+ ensureHeadroom(4);
addFixup(MCE, FK_SecRel_4);
// Emit 4 bytes (zeros) to the object file.
appendContents(4, 0);
@@ -308,6 +310,7 @@ void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol,
if (Offset)
MCE = MCBinaryExpr::createAdd(
MCE, MCConstantExpr::create(Offset, getContext()), getContext());
+ ensureHeadroom(4);
addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
appendContents(4, 0);
@@ -318,6 +321,7 @@ void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) {
// Create Symbol for section number.
const MCExpr *MCE = MCCOFFSectionNumberTargetExpr::create(
*Symbol, this->getWriter(), getContext());
+ ensureHeadroom(4);
addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
appendContents(4, 0);
@@ -328,6 +332,7 @@ void MCWinCOFFStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) {
// Create Symbol for section offset.
const MCExpr *MCE =
MCCOFFSectionOffsetTargetExpr::create(*Symbol, getContext());
+ ensureHeadroom(4);
addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
appendContents(4, 0);
diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp
index 898ac5d..26f45ce 100644
--- a/llvm/lib/MC/MCXCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCXCOFFStreamer.cpp
@@ -103,16 +103,8 @@ void MCXCOFFStreamer::emitXCOFFSymbolLinkageWithVisibility(
void MCXCOFFStreamer::emitXCOFFRefDirective(const MCSymbol *Symbol) {
// Add a Fixup here to later record a relocation of type R_REF to prevent the
// ref symbol from being garbage collected (by the binder).
- MCFragment *DF = getCurrentFragment();
- const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext());
- std::optional<MCFixupKind> MaybeKind =
- getAssembler().getBackend().getFixupKind("R_REF");
- if (!MaybeKind)
- report_fatal_error("failed to get fixup kind for R_REF relocation");
-
- MCFixupKind Kind = *MaybeKind;
- MCFixup Fixup = MCFixup::create(DF->getContents().size(), SRE, Kind);
- DF->addFixup(Fixup);
+ addFixup(MCSymbolRefExpr::create(Symbol, getContext()),
+ XCOFF::RelocationType::R_REF);
}
void MCXCOFFStreamer::emitXCOFFRenameDirective(const MCSymbol *Name,
diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index 7b5c3c0..e87696a 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -806,7 +806,7 @@ uint64_t MachObjectWriter::writeObject() {
}
MCSection *Sec = getContext().getMachOSection("__LLVM", "__cg_profile", 0,
SectionKind::getMetadata());
- llvm::copy(OS.str(), Sec->curFragList()->Head->getContents().data());
+ llvm::copy(OS.str(), Sec->curFragList()->Head->getVarContents().data());
}
unsigned NumSections = Asm.end() - Asm.begin();
diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp
index 2579fa3..0f19495 100644
--- a/llvm/lib/Object/IRSymtab.cpp
+++ b/llvm/lib/Object/IRSymtab.cpp
@@ -8,11 +8,11 @@
#include "llvm/Object/IRSymtab.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Comdat.h"
@@ -213,9 +213,10 @@ Expected<int> Builder::getComdatIndex(const Comdat *C, const Module *M) {
return P.first->second;
}
-static DenseSet<StringRef> buildPreservedSymbolsSet(const Triple &TT) {
- DenseSet<StringRef> PreservedSymbolSet(std::begin(PreservedSymbols),
- std::end(PreservedSymbols));
+static StringSet<> buildPreservedSymbolsSet(const Triple &TT) {
+ StringSet<> PreservedSymbolSet;
+ PreservedSymbolSet.insert(std::begin(PreservedSymbols),
+ std::end(PreservedSymbols));
// FIXME: Do we need to pass in ABI fields from TargetOptions?
RTLIB::RuntimeLibcallsInfo Libcalls(TT);
for (RTLIB::LibcallImpl Impl : Libcalls.getLibcallImpls()) {
@@ -280,7 +281,7 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
setStr(Sym.IRName, GV->getName());
- static const DenseSet<StringRef> PreservedSymbolsSet =
+ static const StringSet<> PreservedSymbolsSet =
buildPreservedSymbolsSet(GV->getParent()->getTargetTriple());
bool IsPreservedSymbol = PreservedSymbolsSet.contains(GV->getName());
diff --git a/llvm/lib/Object/SFrameParser.cpp b/llvm/lib/Object/SFrameParser.cpp
index 2d74d1d..6f0037c 100644
--- a/llvm/lib/Object/SFrameParser.cpp
+++ b/llvm/lib/Object/SFrameParser.cpp
@@ -51,5 +51,6 @@ Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents) {
return SFrameParser(Contents, *Header);
}
-template class llvm::object::SFrameParser<endianness::big>;
-template class llvm::object::SFrameParser<endianness::little>;
+template class LLVM_EXPORT_TEMPLATE llvm::object::SFrameParser<endianness::big>;
+template class LLVM_EXPORT_TEMPLATE
+ llvm::object::SFrameParser<endianness::little>;
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index fd89583..1b111dc 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -84,6 +84,7 @@ MODULE_PASS("global-merge-func", GlobalMergeFuncPass())
MODULE_PASS("globalopt", GlobalOptPass())
MODULE_PASS("globalsplit", GlobalSplitPass())
MODULE_PASS("hipstdpar-interpose-alloc", HipStdParAllocationInterpositionPass())
+MODULE_PASS("hipstdpar-math-fixup", HipStdParMathFixupPass())
MODULE_PASS("hipstdpar-select-accelerator-code",
HipStdParAcceleratorCodeSelectionPass())
MODULE_PASS("hotcoldsplit", HotColdSplittingPass())
diff --git a/llvm/lib/Support/Debug.cpp b/llvm/lib/Support/Debug.cpp
index 5bb04d0..b6f338f 100644
--- a/llvm/lib/Support/Debug.cpp
+++ b/llvm/lib/Support/Debug.cpp
@@ -24,11 +24,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Debug.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/circular_raw_ostream.h"
#include "llvm/Support/raw_ostream.h"
+#include <utility>
#include "DebugOptions.h"
@@ -38,27 +40,62 @@
using namespace llvm;
+/// Parse a debug type string into a pair of the debug type and the debug level.
+/// The expected format is "type[:level]", where the level is an optional
+/// integer.
+static std::pair<std::string, std::optional<int>>
+parseDebugType(StringRef DbgType) {
+ std::optional<int> Level;
+ size_t ColonPos = DbgType.find(':');
+ if (ColonPos != StringRef::npos) {
+ StringRef LevelStr = DbgType.substr(ColonPos + 1);
+ DbgType = DbgType.take_front(ColonPos);
+ if (LevelStr.empty())
+ Level = 0;
+ else {
+ int parsedLevel;
+ if (to_integer(LevelStr, parsedLevel, 10))
+ Level = parsedLevel;
+ }
+ }
+ return std::make_pair(DbgType.str(), Level);
+}
+
// Even though LLVM might be built with NDEBUG, define symbols that the code
// built without NDEBUG can depend on via the llvm/Support/Debug.h header.
namespace llvm {
/// Exported boolean set by the -debug option.
bool DebugFlag = false;
-static ManagedStatic<std::vector<std::string>> CurrentDebugType;
+/// The current debug type and an optional debug level.
+/// The debug level is the verbosity of the debug output.
+/// 0 is a special level that acts as an opt-out for this specific debug type.
+/// If provided, the debug output is enabled only if the user specified a level
+/// at least as high as the provided level.
+static ManagedStatic<std::vector<std::pair<std::string, std::optional<int>>>>
+ CurrentDebugType;
/// Return true if the specified string is the debug type
/// specified on the command line, or if none was specified on the command line
/// with the -debug-only=X option.
-bool isCurrentDebugType(const char *DebugType) {
+bool isCurrentDebugType(const char *DebugType, int Level) {
if (CurrentDebugType->empty())
return true;
+ // Track whether at least one debug type is enabled (no level, or a level > 0);
+ // this allows opting out of some debug types while leaving the others enabled.
+ bool HasEnabledDebugType = false;
// See if DebugType is in list. Note: do not use find() as that forces us to
// unnecessarily create an std::string instance.
- for (auto &d : *CurrentDebugType) {
- if (d == DebugType)
+ for (auto &D : *CurrentDebugType) {
+ HasEnabledDebugType =
+ HasEnabledDebugType || (!D.second.has_value() || D.second.value() > 0);
+ if (D.first != DebugType)
+ continue;
+ if (!D.second.has_value())
return true;
+ return D.second >= Level;
}
- return false;
+ return !HasEnabledDebugType;
}
/// Set the current debug type, as if the -debug-only=X
@@ -73,8 +110,11 @@ void setCurrentDebugType(const char *Type) {
void setCurrentDebugTypes(const char **Types, unsigned Count) {
CurrentDebugType->clear();
- llvm::append_range(*CurrentDebugType, ArrayRef(Types, Count));
+ CurrentDebugType->reserve(Count);
+ for (const char *Type : ArrayRef(Types, Count))
+ CurrentDebugType->push_back(parseDebugType(Type));
}
+
} // namespace llvm
// All Debug.h functionality is a no-op in NDEBUG mode.
@@ -114,10 +154,10 @@ struct DebugOnlyOpt {
if (Val.empty())
return;
DebugFlag = true;
- SmallVector<StringRef,8> dbgTypes;
- StringRef(Val).split(dbgTypes, ',', -1, false);
- for (auto dbgType : dbgTypes)
- CurrentDebugType->push_back(std::string(dbgType));
+ SmallVector<StringRef, 8> DbgTypes;
+ StringRef(Val).split(DbgTypes, ',', -1, false);
+ for (auto DbgType : DbgTypes)
+ CurrentDebugType->push_back(parseDebugType(DbgType));
}
};
} // namespace
@@ -129,8 +169,13 @@ struct CreateDebugOnly {
static void *call() {
return new cl::opt<DebugOnlyOpt, true, cl::parser<std::string>>(
"debug-only",
- cl::desc("Enable a specific type of debug output (comma separated list "
- "of types)"),
+ cl::desc(
+ "Enable a specific type of debug output (comma separated list "
+ "of types using the format \"type[:level]\", where the level "
+ "is an optional integer. The level can be set to 1, 2, 3, etc. to "
+ "control the verbosity of the output. Setting a debug-type level "
+ "to zero acts as an opt-out for this specific debug-type without "
+ "affecting the others."),
cl::Hidden, cl::value_desc("debug string"),
cl::location(DebugOnlyOptLoc), cl::ValueRequired);
}
diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
index 277247e..cc02cae 100644
--- a/llvm/lib/Support/Unix/Path.inc
+++ b/llvm/lib/Support/Unix/Path.inc
@@ -1190,7 +1190,7 @@ Expected<size_t> readNativeFile(file_t FD, MutableArrayRef<char> Buf) {
size_t Size = Buf.size();
#endif
ssize_t NumRead = sys::RetryAfterSignal(-1, ::read, FD, Buf.data(), Size);
- if (ssize_t(NumRead) == -1)
+ if (NumRead == -1)
return errorCodeToError(errnoAsErrorCode());
// The underlying operation on these platforms allow opening directories
// for reading in more cases than other platforms.
diff --git a/llvm/lib/Support/Windows/Threading.inc b/llvm/lib/Support/Windows/Threading.inc
index d862dbd..8dd7c88 100644
--- a/llvm/lib/Support/Windows/Threading.inc
+++ b/llvm/lib/Support/Windows/Threading.inc
@@ -106,7 +106,67 @@ void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
Name.clear();
}
+namespace llvm::sys::windows {
+HMODULE loadSystemModuleSecure(LPCWSTR lpModuleName) {
+ // Ensure we indeed load the module from the system32 path.
+ // As per GetModuleHandle documentation:
+ // "If lpModuleName does not include a path and there is more than one loaded
+ // module with the same base name and extension, you cannot predict which
+ // module handle will be returned.". This mitigates
+ // https://learn.microsoft.com/en-us/security-updates/securityadvisories/2010/2269637
+ SmallVector<wchar_t, MAX_PATH> Buf;
+ size_t Size = MAX_PATH;
+ do {
+ Buf.resize_for_overwrite(Size);
+ SetLastError(NO_ERROR);
+ Size = ::GetSystemDirectoryW(Buf.data(), Buf.size());
+ if (Size == 0)
+ return NULL;
+
+ // Try again with larger buffer.
+ } while (Size > Buf.size());
+
+ Buf.truncate(Size);
+ Buf.push_back(L'\\');
+ Buf.append(lpModuleName, lpModuleName + std::wcslen(lpModuleName));
+ Buf.push_back(0);
+
+ return ::GetModuleHandleW(Buf.data());
+}
+} // namespace llvm::sys::windows
+
SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) {
+ HMODULE kernelM = llvm::sys::windows::loadSystemModuleSecure(L"kernel32.dll");
+ if (kernelM) {
+ // SetThreadInformation is only available on Windows 8 and later. Since we
+ // still support compilation on Windows 7, we load the function dynamically.
+ typedef BOOL(WINAPI * SetThreadInformation_t)(
+ HANDLE hThread, THREAD_INFORMATION_CLASS ThreadInformationClass,
+ _In_reads_bytes_(ThreadInformationSize) PVOID ThreadInformation,
+ ULONG ThreadInformationSize);
+ static const auto pfnSetThreadInformation =
+ (SetThreadInformation_t)::GetProcAddress(kernelM,
+ "SetThreadInformation");
+ if (pfnSetThreadInformation) {
+ auto setThreadInformation = [](ULONG ControlMaskAndStateMask) {
+ THREAD_POWER_THROTTLING_STATE state{};
+ state.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION;
+ state.ControlMask = ControlMaskAndStateMask;
+ state.StateMask = ControlMaskAndStateMask;
+ return pfnSetThreadInformation(
+ ::GetCurrentThread(), ThreadPowerThrottling, &state, sizeof(state));
+ };
+
+ // Use EcoQoS for ThreadPriority::Background when available (running on the most
+ // efficient cores at the most efficient cpu frequency):
+ // https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreadinformation
+ // https://learn.microsoft.com/en-us/windows/win32/procthread/quality-of-service
+ setThreadInformation(Priority == ThreadPriority::Background
+ ? THREAD_POWER_THROTTLING_EXECUTION_SPEED
+ : 0);
+ }
+ }
+
// https://docs.microsoft.com/en-us/windows/desktop/api/processthreadsapi/nf-processthreadsapi-setthreadpriority
// Begin background processing mode. The system lowers the resource scheduling
// priorities of the thread so that it can perform background work without
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 1f3e5dc..3f318e2 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -985,6 +985,12 @@ const Init *UnOpInit::Fold(const Record *CurRec, bool IsFinal) const {
}
break;
+ case GETDAGOPNAME:
+ if (const auto *Dag = dyn_cast<DagInit>(LHS)) {
+ return Dag->getName();
+ }
+ break;
+
case LOG2:
if (const auto *LHSi = dyn_cast_or_null<IntInit>(
LHS->convertInitializerTo(IntRecTy::get(RK)))) {
@@ -1050,6 +1056,9 @@ std::string UnOpInit::getAsString() const {
case SIZE: Result = "!size"; break;
case EMPTY: Result = "!empty"; break;
case GETDAGOP: Result = "!getdagop"; break;
+ case GETDAGOPNAME:
+ Result = "!getdagopname";
+ break;
case LOG2 : Result = "!logtwo"; break;
case LISTFLATTEN:
Result = "!listflatten";
@@ -1310,7 +1319,11 @@ const Init *BinOpInit::Fold(const Record *CurRec) const {
SmallVector<std::pair<const Init *, const StringInit *>, 8> Args;
llvm::append_range(Args, LHSs->getArgAndNames());
llvm::append_range(Args, RHSs->getArgAndNames());
- return DagInit::get(Op, Args);
+ // Use the name of the LHS DAG if it's set, otherwise the name of the RHS.
+ const auto *NameInit = LHSs->getName();
+ if (!NameInit)
+ NameInit = RHSs->getName();
+ return DagInit::get(Op, NameInit, Args);
}
break;
}
@@ -1508,6 +1521,14 @@ const Init *BinOpInit::Fold(const Record *CurRec) const {
return DagInit::get(Op, Dag->getArgs(), Dag->getArgNames());
break;
}
+ case SETDAGOPNAME: {
+ const auto *Dag = dyn_cast<DagInit>(LHS);
+ const auto *Op = dyn_cast<StringInit>(RHS);
+ if (Dag && Op)
+ return DagInit::get(Dag->getOperator(), Op, Dag->getArgs(),
+ Dag->getArgNames());
+ break;
+ }
case ADD:
case SUB:
case MUL:
@@ -1620,6 +1641,9 @@ std::string BinOpInit::getAsString() const {
case STRCONCAT: Result = "!strconcat"; break;
case INTERLEAVE: Result = "!interleave"; break;
case SETDAGOP: Result = "!setdagop"; break;
+ case SETDAGOPNAME:
+ Result = "!setdagopname";
+ break;
case GETDAGARG:
Result = "!getdagarg<" + getType()->getAsString() + ">";
break;
diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index aea1bb0..c369916 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -680,6 +680,8 @@ tgtok::TokKind TGLexer::LexExclaim() {
.Case("find", tgtok::XFind)
.Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated.
.Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated.
+ .Case("setdagopname", tgtok::XSetDagOpName)
+ .Case("getdagopname", tgtok::XGetDagOpName)
.Case("getdagarg", tgtok::XGetDagArg)
.Case("getdagname", tgtok::XGetDagName)
.Case("setdagarg", tgtok::XSetDagArg)
diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h
index ed7d8f3..5725e39 100644
--- a/llvm/lib/TableGen/TGLexer.h
+++ b/llvm/lib/TableGen/TGLexer.h
@@ -150,6 +150,8 @@ enum TokKind {
XGt,
XSetDagOp,
XGetDagOp,
+ XSetDagOpName,
+ XGetDagOpName,
XExists,
XListRemove,
XToLower,
diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp
index 62c5355..81b61b1 100644
--- a/llvm/lib/TableGen/TGParser.cpp
+++ b/llvm/lib/TableGen/TGParser.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "TGParser.h"
+#include "TGLexer.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
@@ -1199,6 +1200,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
case tgtok::XCast:
case tgtok::XRepr:
case tgtok::XGetDagOp:
+ case tgtok::XGetDagOpName:
case tgtok::XInitialized: { // Value ::= !unop '(' Value ')'
UnOpInit::UnaryOp Code;
const RecTy *Type = nullptr;
@@ -1287,6 +1289,11 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
}
Code = UnOpInit::GETDAGOP;
break;
+ case tgtok::XGetDagOpName:
+ Lex.Lex(); // eat the operation
+ Type = StringRecTy::get(Records);
+ Code = UnOpInit::GETDAGOPNAME;
+ break;
case tgtok::XInitialized:
Lex.Lex(); // eat the operation
Code = UnOpInit::INITIALIZED;
@@ -1514,7 +1521,8 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
case tgtok::XInterleave:
case tgtok::XGetDagArg:
case tgtok::XGetDagName:
- case tgtok::XSetDagOp: { // Value ::= !binop '(' Value ',' Value ')'
+ case tgtok::XSetDagOp:
+ case tgtok::XSetDagOpName: { // Value ::= !binop '(' Value ',' Value ')'
tgtok::TokKind OpTok = Lex.getCode();
SMLoc OpLoc = Lex.getLoc();
Lex.Lex(); // eat the operation
@@ -1550,6 +1558,9 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
case tgtok::XStrConcat: Code = BinOpInit::STRCONCAT; break;
case tgtok::XInterleave: Code = BinOpInit::INTERLEAVE; break;
case tgtok::XSetDagOp: Code = BinOpInit::SETDAGOP; break;
+ case tgtok::XSetDagOpName:
+ Code = BinOpInit::SETDAGOPNAME;
+ break;
case tgtok::XGetDagArg:
Code = BinOpInit::GETDAGARG;
break;
@@ -1580,6 +1591,10 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
}
ArgType = DagRecTy::get(Records);
break;
+ case tgtok::XSetDagOpName:
+ Type = DagRecTy::get(Records);
+ ArgType = DagRecTy::get(Records);
+ break;
case tgtok::XGetDagName:
Type = StringRecTy::get(Records);
ArgType = DagRecTy::get(Records);
@@ -1773,22 +1788,26 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
// Deal with BinOps whose arguments have different types, by
// rewriting ArgType in between them.
switch (Code) {
- case BinOpInit::SETDAGOP:
- // After parsing the first dag argument, switch to expecting
- // a record, with no restriction on its superclasses.
- ArgType = RecordRecTy::get(Records, {});
- break;
- case BinOpInit::GETDAGARG:
- // After parsing the first dag argument, expect an index integer or a
- // name string.
- ArgType = nullptr;
- break;
- case BinOpInit::GETDAGNAME:
- // After parsing the first dag argument, expect an index integer.
- ArgType = IntRecTy::get(Records);
- break;
- default:
- break;
+ case BinOpInit::SETDAGOPNAME:
+ // After parsing the first dag argument, expect a string.
+ ArgType = StringRecTy::get(Records);
+ break;
+ case BinOpInit::SETDAGOP:
+ // After parsing the first dag argument, switch to expecting
+ // a record, with no restriction on its superclasses.
+ ArgType = RecordRecTy::get(Records, {});
+ break;
+ case BinOpInit::GETDAGARG:
+ // After parsing the first dag argument, expect an index integer or a
+ // name string.
+ ArgType = nullptr;
+ break;
+ case BinOpInit::GETDAGNAME:
+ // After parsing the first dag argument, expect an index integer.
+ ArgType = IntRecTy::get(Records);
+ break;
+ default:
+ break;
}
if (!consume(tgtok::comma))
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7b49754..7c9fc67 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11325,7 +11325,7 @@ static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
SDValue AArch64TargetLowering::LowerSELECT_CC(
ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
- iterator_range<SDNode::user_iterator> Users, bool HasNoNaNs,
+ iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags,
const SDLoc &DL, SelectionDAG &DAG) const {
// Handle f128 first, because it will result in a comparison of some RTLIB
// call result against zero.
@@ -11523,7 +11523,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
return true;
}
})) {
- bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs;
+ bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
SDValue VectorCmp =
emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
if (VectorCmp)
@@ -11537,7 +11537,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
- if (DAG.getTarget().Options.UnsafeFPMath) {
+ if (Flags.hasNoSignedZeros()) {
// Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
// "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
@@ -11616,10 +11616,9 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
SDValue RHS = Op.getOperand(1);
SDValue TVal = Op.getOperand(2);
SDValue FVal = Op.getOperand(3);
- bool HasNoNans = Op->getFlags().hasNoNaNs();
+ SDNodeFlags Flags = Op->getFlags();
SDLoc DL(Op);
- return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL,
- DAG);
+ return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
}
SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
@@ -11627,7 +11626,6 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
SDValue CCVal = Op->getOperand(0);
SDValue TVal = Op->getOperand(1);
SDValue FVal = Op->getOperand(2);
- bool HasNoNans = Op->getFlags().hasNoNaNs();
SDLoc DL(Op);
EVT Ty = Op.getValueType();
@@ -11694,8 +11692,8 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
DAG.getUNDEF(MVT::f32), FVal);
}
- SDValue Res =
- LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, DAG);
+ SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
+ Op->getFlags(), DL, DAG);
if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
@@ -12292,7 +12290,9 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
SDLoc DL(Operand);
EVT VT = Operand.getValueType();
- SDNodeFlags Flags = SDNodeFlags::AllowReassociation;
+ // Ensure nodes can be recognized by isAssociativeAndCommutative.
+ SDNodeFlags Flags =
+ SDNodeFlags::AllowReassociation | SDNodeFlags::NoSignedZeros;
// Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
@@ -16674,7 +16674,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
(Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath));
+ I->getFastMathFlags().allowContract()));
}
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 95d0e3b..ea63edd8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -662,7 +662,7 @@ private:
SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
SDValue TVal, SDValue FVal,
iterator_range<SDNode::user_iterator> Users,
- bool HasNoNans, const SDLoc &dl,
+ SDNodeFlags Flags, const SDLoc &dl,
SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 8685d7a0..59d4fd2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -6574,10 +6574,8 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
// We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
// the target options or if FADD/FSUB has the contract fast-math flag.
- return Options.UnsafeFPMath ||
- Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ return Options.AllowFPOpFusion == FPOpFusion::Fast ||
Inst.getFlag(MachineInstr::FmContract);
- return true;
}
return false;
}
@@ -6680,9 +6678,8 @@ bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
case AArch64::FMUL_ZZZ_H:
case AArch64::FMUL_ZZZ_S:
case AArch64::FMUL_ZZZ_D:
- return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
- (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
- Inst.getFlag(MachineInstr::MIFlag::FmNsz));
+ return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+ Inst.getFlag(MachineInstr::MIFlag::FmNsz);
// == Integer types ==
// -- Base instructions --
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 8b8fc8b..a17fb93 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -286,6 +286,12 @@ def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch",
"VMEM CU scope prefetches do not fail on illegal address"
>;
+def FeatureCUStores : SubtargetFeature<"cu-stores",
+ "HasCUStores",
+ "true",
+ "Whether SCOPE_CU stores can be used on GFX12.5"
+>;
+
def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
"HasVcmpxExecWARHazard",
"true",
@@ -1988,6 +1994,7 @@ def FeatureISAVersion12 : FeatureSet<
def FeatureISAVersion12_50 : FeatureSet<
[FeatureGFX12,
FeatureGFX1250Insts,
+ FeatureCUStores,
FeatureCuMode,
Feature64BitLiterals,
FeatureLDSBankCount32,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 4b3dc37..6681393 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -552,6 +552,7 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
MCContext &Ctx = MF.getContext();
uint16_t KernelCodeProperties = 0;
const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
KernelCodeProperties |=
@@ -581,10 +582,13 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
}
- if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
+ if (ST.isWave32()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
}
+ if (isGFX1250(ST) && ST.hasCUStores()) {
+ KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES;
+ }
// CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
// un-evaluatable at this point so it cannot be conditionally checked here.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 49d8b44..59cc1df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -13,7 +13,6 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index c01e5d3..992572f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -143,6 +143,9 @@ def gi_global_saddr_cpol :
def gi_global_saddr_glc :
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
GIComplexPatternEquiv<GlobalSAddrGLC>;
+def gi_global_saddr_no_ioffset :
+ GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">,
+ GIComplexPatternEquiv<GlobalSAddrNoIOffset>;
def gi_mubuf_scratch_offset :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index dfaa145..3d7e678 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2049,6 +2049,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
+ SDValue &SAddr,
+ SDValue &VOffset,
+ SDValue &CPol) const {
+ bool ScaleOffset;
+ SDValue DummyOffset;
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
+ false))
+ return false;
+
+ // We are assuming CPol is always the last operand of the intrinsic.
+ auto PassedCPol =
+ N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
+ CPol = CurDAG->getTargetConstant(
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
+ return true;
+}
+
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 5636d89..983f1aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -174,6 +174,8 @@ private:
bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
+ bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &CPol) const;
bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &Offset) const;
bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e3ca09e..6118933 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -391,8 +391,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// Library functions. These default to Expand, but we have instructions
// for them.
setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
- ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
- MVT::f32, Legal);
+ ISD::FROUNDEVEN, ISD::FTRUNC},
+ {MVT::f16, MVT::f32}, Legal);
+ setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal);
setOperationAction(ISD::FLOG2, MVT::f32, Custom);
setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
@@ -412,9 +413,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
- if (Subtarget->has16BitInsts())
+ if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
- else {
+ setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal);
+ } else {
setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
}
@@ -4844,94 +4846,11 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
}
-// Detect when CMP and SELECT use the same constant and fold them to avoid
-// loading the constant twice. Specifically handles patterns like:
-// %cmp = icmp eq i32 %val, 4242
-// %sel = select i1 %cmp, i32 4242, i32 %other
-// It can be optimized to reuse %val instead of 4242 in select.
-static SDValue
-foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
- const AMDGPUSubtarget *ST) {
- SDValue Cond = N->getOperand(0);
- SDValue TrueVal = N->getOperand(1);
- SDValue FalseVal = N->getOperand(2);
-
- // Check if condition is a comparison.
- if (Cond.getOpcode() != ISD::SETCC)
- return SDValue();
-
- SDValue LHS = Cond.getOperand(0);
- SDValue RHS = Cond.getOperand(1);
- ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
-
- bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
- bool isInteger = LHS.getValueType().isInteger();
-
- // Handle simple floating-point and integer types only.
- if (!isFloatingPoint && !isInteger)
- return SDValue();
-
- bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
- bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
- if (!isEquality && !isNonEquality)
- return SDValue();
-
- SDValue ArgVal, ConstVal;
- if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
- (isInteger && isa<ConstantSDNode>(RHS))) {
- ConstVal = RHS;
- ArgVal = LHS;
- } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
- (isInteger && isa<ConstantSDNode>(LHS))) {
- ConstVal = LHS;
- ArgVal = RHS;
- } else {
- return SDValue();
- }
-
- // Check if constant should not be optimized - early return if not.
- if (isFloatingPoint) {
- const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
- const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST);
-
- // Only optimize normal floating-point values (finite, non-zero, and
- // non-subnormal as per IEEE 754), skip optimization for inlinable
- // floating-point constants.
- if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val))
- return SDValue();
- } else {
- int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue();
-
- // Skip optimization for inlinable integer immediates.
- // Inlinable immediates include: -16 to 64 (inclusive).
- if (IntVal >= -16 && IntVal <= 64)
- return SDValue();
- }
-
- // For equality and non-equality comparisons, patterns:
- // select (setcc x, const), const, y -> select (setcc x, const), x, y
- // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
- if (!(isEquality && TrueVal == ConstVal) &&
- !(isNonEquality && FalseVal == ConstVal))
- return SDValue();
-
- SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
- SDValue SelectRHS =
- (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
- return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
- SelectLHS, SelectRHS);
-}
-
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
return Folded;
- // Try to fold CMP + SELECT patterns with shared constants (both FP and
- // integer).
- if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget))
- return Folded;
-
SDValue Cond = N->getOperand(0);
if (Cond.getOpcode() != ISD::SETCC)
return SDValue();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 266dee1..04773c9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5789,6 +5789,17 @@ AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
+ MachineOperand &Root) const {
+ const MachineInstr &I = *Root.getParent();
+
+ // We are assuming CPol is always the last operand of the intrinsic.
+ auto PassedCPol =
+ I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
+ return selectGlobalSAddr(Root, PassedCPol, false);
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
Register Addr = Root.getReg();
Register PtrBase;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index fe9743d0a..140e753 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -264,6 +264,8 @@ private:
selectGlobalSAddrCPol(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddrGLC(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectGlobalSAddrNoIOffset(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectScratchSAddr(MachineOperand &Root) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c5a1d9e..306443d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5364,6 +5364,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c865082..38f9ee5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -836,8 +836,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
// When we are not using -fgpu-rdc, we can run accelerator code
// selection relatively early, but still after linking to prevent
// eager removal of potentially reachable symbols.
- if (EnableHipStdPar)
+ if (EnableHipStdPar) {
+ PM.addPass(HipStdParMathFixupPass());
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
+ }
PM.addPass(AMDGPUPrintfRuntimeBindingPass());
}
@@ -916,8 +918,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
// selection after linking to prevent, otherwise we end up removing
// potentially reachable symbols that were exported as external in other
// modules.
- if (EnableHipStdPar)
+ if (EnableHipStdPar) {
+ PM.addPass(HipStdParMathFixupPass());
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
+ }
// We want to support the -lto-partitions=N option as "best effort".
// For that, we need to lower LDS earlier in the pipeline before the
// module is partitioned for codegen.
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 421fc42..44e65b3 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -6066,6 +6066,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
ExprVal, ValRange);
if (Val)
ImpliedUserSGPRCount += 1;
+ } else if (ID == ".amdhsa_uses_cu_stores") {
+ if (!isGFX1250())
+ return Error(IDRange.Start, "directive requires gfx12.5", IDRange);
+
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange);
} else if (ID == ".amdhsa_wavefront_size32") {
EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
if (IVersion.Major < 10)
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 5c1989b..ffe6b06 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -2556,6 +2556,9 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective(
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
+ if (isGFX1250())
+ PRINT_DIRECTIVE(".amdhsa_uses_cu_stores",
+ KERNEL_CODE_PROPERTY_USES_CU_STORES);
if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0,
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 7207c25..8ede9ca 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -11,6 +11,7 @@ let WantsRoot = true in {
def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [], -10>;
def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>;
+ def GlobalSAddrNoIOffset : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffset", [], [], -3>;
def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>;
@@ -369,31 +370,68 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> {
}
}
-class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo<
+// Async loads, introduced in gfx1250, will store directly
+// to a DS address in vdst (they will not use M0 for DS address).
+class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo<
opName,
(outs ),
!con(
- !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)),
- (ins flat_offset:$offset, CPol_0:$cpol)),
- " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
- let LGKM_CNT = 1;
+ !if(IsAsync, (ins VGPR_32:$vdst), (ins)),
+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)),
+ (ins flat_offset:$offset, CPol_0:$cpol)),
+ !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
+ let LGKM_CNT = !not(IsAsync);
+ let VM_CNT = !not(IsAsync);
+ let ASYNC_CNT = IsAsync;
let is_flat_global = 1;
let lds = 1;
let has_data = 0;
+ let has_vdst = IsAsync; // vdst for ds address with IsAsync
+ let mayLoad = 1;
+ let mayStore = 1;
+ let has_saddr = 1;
+ let enabled_saddr = EnableSaddr;
+ let VALU = 1;
+ let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
+ let Uses = !if(IsAsync, [EXEC, ASYNCcnt], [M0, EXEC]);
+ let Defs = !if(IsAsync, [ASYNCcnt], []);
+ let SchedRW = [WriteVMEM, WriteLDS];
+}
+
+multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> {
+ def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>,
+ GlobalSaddrTable<1, opName>;
+}
+
+class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo<
+ opName,
+ (outs ),
+ !con(
+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata),
+ (ins flat_offset:$offset, CPol_0:$cpol)),
+ " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
+ let VM_CNT = 0;
+ let ASYNC_CNT = 1;
+ let is_flat_global = 1;
+ let lds = 1;
+ let has_data = 1; // vdata for ds address
let has_vdst = 0;
let mayLoad = 1;
let mayStore = 1;
let has_saddr = 1;
let enabled_saddr = EnableSaddr;
let VALU = 1;
- let Uses = [M0, EXEC];
+ let Uses = [EXEC, ASYNCcnt];
+ let Defs = [ASYNCcnt];
let SchedRW = [WriteVMEM, WriteLDS];
}
-multiclass FLAT_Global_Load_LDS_Pseudo<string opName> {
- def "" : FLAT_Global_Load_LDS_Pseudo<opName>,
+multiclass FLAT_Global_STORE_LDS_Pseudo<string opName> {
+ def "" : FLAT_Global_STORE_LDS_Pseudo<opName>,
GlobalSaddrTable<0, opName>;
- def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1>,
+ def _SADDR : FLAT_Global_STORE_LDS_Pseudo<opName, 1>,
GlobalSaddrTable<1, opName>;
}
@@ -1156,6 +1194,15 @@ let SubtargetPredicate = isGFX12Plus in {
let SubtargetPredicate = isGFX1250Plus in {
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b128", 1>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b8">;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b32">;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b64">;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b128">;
+
def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>;
def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">;
} // End SubtargetPredicate = isGFX1250Plus
@@ -1315,6 +1362,26 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
(inst $saddr, $voffset, $offset, $cpol)
>;
+class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
+ (inst $dsaddr, $vaddr, $offset, $cpol)
+>;
+
+class GlobalLoadLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
+ (inst $dsaddr, $saddr, $voffset, $offset, $cpol)
+>;
+
+class FlatStoreLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
+ (inst $vaddr, $dsaddr, $offset, $cpol)
+>;
+
+class GlobalStoreLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
+ (inst $saddr, $voffset, $dsaddr, $offset, $cpol)
+>;
+
class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
(inst $saddr, $voffset, $offset, $cpol)
@@ -1525,6 +1592,26 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va
(inst $vaddr, $saddr, $offset, $cpol)
>;
+multiclass GlobalLoadLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
+ def : FlatLoadLDSSignedPat <inst, node> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalLoadLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> {
+ let AddedComplexity = 11;
+ }
+}
+
+multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
+ def : FlatStoreLDSSignedPat <inst, node> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalStoreLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> {
+ let AddedComplexity = 11;
+ }
+}
+
multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadSignedPat <inst, node, vt> {
let AddedComplexity = 10;
@@ -2091,6 +2178,18 @@ let OtherPredicates = [isGFX125xOnly] in {
defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>;
} // End SubtargetPredicate = isGFX125xOnly
+let OtherPredicates = [isGFX1250Plus] in {
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>;
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>;
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>;
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B128, int_amdgcn_global_load_async_to_lds_b128>;
+
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B8, int_amdgcn_global_store_async_from_lds_b8>;
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B32, int_amdgcn_global_store_async_from_lds_b32>;
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B64, int_amdgcn_global_store_async_from_lds_b64>;
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B128, int_amdgcn_global_store_async_from_lds_b128>;
+}
+
let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
@@ -3374,6 +3473,15 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x62>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x63>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x64>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x65>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x66>;
+
defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">;
defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 9a2bab1..0a0a107 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -537,6 +537,63 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
return getMaxNumVGPRs(MF.getFunction());
}
+std::pair<unsigned, unsigned>
+GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
+ const unsigned MaxVectorRegs = getMaxNumVGPRs(F);
+
+ unsigned MaxNumVGPRs = MaxVectorRegs;
+ unsigned MaxNumAGPRs = 0;
+
+ // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
+ // a wave may have up to 512 total vector registers combining together both
+ // VGPRs and AGPRs. Hence, in an entry function without calls and without
+ // AGPRs used within it, it is possible to use the whole vector register
+ // budget for VGPRs.
+ //
+ // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
+ // register file accordingly.
+ if (hasGFX90AInsts()) {
+ unsigned MinNumAGPRs = 0;
+ const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
+ const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+
+ const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
+
+ // TODO: The lower bound should probably force the number of required
+ // registers up, overriding amdgpu-waves-per-eu.
+ std::tie(MinNumAGPRs, MaxNumAGPRs) =
+ AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", DefaultNumAGPR,
+ /*OnlyFirstRequired=*/true);
+
+ if (MinNumAGPRs == DefaultNumAGPR.first) {
+ // Default to splitting half the registers if AGPRs are required.
+ MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
+ } else {
+ // Align to accum_offset's allocation granularity.
+ MinNumAGPRs = alignTo(MinNumAGPRs, 4);
+
+ MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
+ }
+
+ // Clamp values to be inbounds of our limits, and ensure min <= max.
+
+ MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
+ MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
+
+ MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
+ MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
+
+ assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
+ MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
+ "invalid register counts");
+ } else if (hasMAIInsts()) {
+ // On gfx908 the number of AGPRs always equals the number of VGPRs.
+ MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
+ }
+
+ return std::pair(MaxNumVGPRs, MaxNumAGPRs);
+}
+
void GCNSubtarget::adjustSchedDependency(
SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
const TargetSchedModel *SchedModel) const {
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 88a269f..10ded0e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -248,6 +248,7 @@ protected:
bool HasVmemPrefInsts = false;
bool HasSafeSmemPrefetch = false;
bool HasSafeCUPrefetch = false;
+ bool HasCUStores = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
@@ -998,6 +999,8 @@ public:
bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
+ bool hasCUStores() const { return HasCUStores; }
+
// Has s_cmpk_* instructions.
bool hasSCmpK() const { return getGeneration() < GFX12; }
@@ -1667,6 +1670,10 @@ public:
return getMaxNumVGPRs(F);
}
+ /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
+ /// of waves per execution unit required for the function \p F.
+ std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;
+
/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p MF, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 10f6d33..43ca548 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -440,6 +440,11 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
".amdhsa_user_sgpr_private_segment_size");
+ if (isGFX1250(STI))
+ PrintField(KD.kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT,
+ amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES,
+ ".amdhsa_uses_cu_stores");
if (IVersion.Major >= 10)
PrintField(KD.kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8d51ec6..fbaf9bc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1256,6 +1256,25 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
}
+static unsigned getIntrMemWidth(unsigned IntrID) {
+ switch (IntrID) {
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ return 8;
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ return 32;
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ return 64;
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128:
+ return 128;
+ default:
+ llvm_unreachable("Unknown width");
+ }
+}
+
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
@@ -1527,6 +1546,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getArgOperand(1);
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ }
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getArgOperand(0);
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ }
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
Info.opc = ISD::INTRINSIC_VOID;
@@ -1623,10 +1662,18 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_global_load_tr_b128:
case Intrinsic::amdgcn_global_load_tr4_b64:
case Intrinsic::amdgcn_global_load_tr6_b96:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128:
Ptr = II->getArgOperand(0);
break;
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
Ptr = II->getArgOperand(1);
break;
default:
@@ -15896,6 +15943,78 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
return SDValue(CSrc, 0);
}
+SDValue SITargetLowering::performSelectCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+
+ // Fold SELECT operands that repeat the constant used by the SETCC feeding
+ // the select (both FP and integer). On the arm where the equality is known
+ // to hold, the compared value can stand in for the constant, which avoids
+ // loading the constant twice. Specifically handles patterns like:
+ // %cmp = icmp eq i32 %val, 4242
+ // %sel = select i1 %cmp, i32 4242, i32 %other
+ // Here the select can reuse %val instead of the second 4242.
+ SDValue Cond = N->getOperand(0);
+ SDValue TrueVal = N->getOperand(1);
+ SDValue FalseVal = N->getOperand(2);
+
+ // Only a SETCC condition is handled; bail out on anything else.
+ if (Cond.getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
+ bool isInteger = LHS.getValueType().isInteger();
+
+ // Bail out unless the compared type is floating-point or integer.
+ if (!isFloatingPoint && !isInteger)
+ return SDValue();
+
+ bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
+ bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
+ if (!isEquality && !isNonEquality)
+ return SDValue();
+
+ SDValue ArgVal, ConstVal;
+ if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
+ (isInteger && isa<ConstantSDNode>(RHS))) {
+ ConstVal = RHS;
+ ArgVal = LHS;
+ } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
+ (isInteger && isa<ConstantSDNode>(LHS))) {
+ ConstVal = LHS;
+ ArgVal = RHS;
+ } else {
+ return SDValue();
+ }
+
+ // Skip inlinable immediates; FP constants must also be normal values.
+ if (isFloatingPoint) {
+ const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
+ if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
+ return SDValue();
+ } else {
+ if (AMDGPU::isInlinableIntLiteral(
+ cast<ConstantSDNode>(ConstVal)->getSExtValue()))
+ return SDValue();
+ }
+
+ // The constant must be on the arm taken when x == const holds:
+ // select (setcc x, const), const, y -> select (setcc x, const), x, y
+ // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
+ if (!(isEquality && TrueVal == ConstVal) &&
+ !(isNonEquality && FalseVal == ConstVal))
+ return SDValue();
+
+ SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
+ SDValue SelectRHS =
+ (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
+ return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
+ SelectLHS, SelectRHS);
+}
+
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
@@ -15944,6 +16063,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performFMulCombine(N, DCI);
case ISD::SETCC:
return performSetCCCombine(N, DCI);
+ case ISD::SELECT:
+ if (auto Res = performSelectCombine(N, DCI))
+ return Res;
+ break;
case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index acf6158..dedd9ae 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -211,6 +211,7 @@ private:
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index dd3f2fe..520c321 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -552,7 +552,7 @@ public:
(!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
// FLAT and SCRATCH instructions may access scratch. Other VMEM
// instructions do not.
- if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
+ if (TII->mayAccessScratchThroughFlat(Inst))
return SCRATCH_WRITE_ACCESS;
return VMEM_WRITE_ACCESS;
}
@@ -565,7 +565,6 @@ public:
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
- bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
@@ -2160,32 +2159,6 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
return false;
}
-// This is a flat memory operation. Check to see if it has memory tokens for
-// either scratch or FLAT.
-bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
- const MachineInstr &MI) const {
- assert(TII->isFLAT(MI));
-
- // SCRATCH instructions always access scratch.
- if (TII->isFLATScratch(MI))
- return true;
-
- // GLOBAL instructions never access scratch.
- if (TII->isFLATGlobal(MI))
- return false;
-
- // If there are no memory operands then conservatively assume the flat
- // operation may access scratch.
- if (MI.memoperands_empty())
- return true;
-
- // See if any memory operand specifies an address space that involves scratch.
- return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
- unsigned AS = Memop->getAddrSpace();
- return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
- });
-}
-
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
(TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 8d6c1d0..2aa6b4e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4249,6 +4249,32 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}
+bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
+ if (!isFLAT(MI) || isFLATGlobal(MI))
+ return false;
+
+ // If scratch is not initialized, we can never access it.
+ if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
+ return false;
+
+ // SCRATCH instructions always access scratch.
+ if (isFLATScratch(MI))
+ return true;
+
+ // If there are no memory operands then conservatively assume the flat
+ // operation may access scratch.
+ if (MI.memoperands_empty())
+ return true;
+
+ // TODO: Should this also take noalias.addrspace metadata into account?
+
+ // Otherwise scratch is reachable only via a private or flat memory operand.
+ return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
+ unsigned AS = Memop->getAddrSpace();
+ return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
+ });
+}
+
bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
// Skip the full operand and register alias search modifiesRegister
// does. There's only a handful of instructions that touch this, it's only an
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 2ffb783..e042b59 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -678,6 +678,12 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
}
+ /// \returns true for SCRATCH_ instructions, or FLAT_ instructions whose
+ /// memory operands are absent or in the private/flat address space; always
+ /// false in functions marked "amdgpu-no-flat-scratch-init". Conservative:
+ /// returns true if \p MI cannot be proven to not hit scratch.
+ bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+
static bool isBlockLoadStore(uint16_t Opcode) {
switch (Opcode) {
case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 9f61bf8..9509199 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -351,6 +351,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF,
MachineRegisterInfo &MRI = MF.getRegInfo();
BitVector ReservedRegs = TRI->getReservedRegs(MF);
BitVector NonWwmAllocMask(TRI->getNumRegs());
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
// FIXME: MaxNumVGPRsForWwmAllocation might need to be adjusted in the future
// to have a balanced allocation between WWM values and per-thread vector
@@ -359,7 +360,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF,
NumRegs =
std::min(static_cast<unsigned>(MFI->getSGPRSpillVGPRs().size()), NumRegs);
- auto [MaxNumVGPRs, MaxNumAGPRs] = TRI->getMaxNumVectorRegs(MF);
+ auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
// Try to use the highest available registers for now. Later after
// vgpr-regalloc, they can be shifted to the lowest range.
unsigned I = 0;
@@ -376,7 +377,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF,
// Reserve an arbitrary register and report the error.
TRI->markSuperRegs(RegMask, AMDGPU::VGPR0);
MF.getFunction().getContext().emitError(
- "can't find enough VGPRs for wwm-regalloc");
+ "cannot find enough VGPRs for wwm-regalloc");
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 0e8a420..025731a 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -321,7 +321,7 @@ public:
bool IsNonTemporal,
bool IsLastUse = false) const = 0;
- virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
+ virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
return false;
};
@@ -602,7 +602,7 @@ public:
bool IsVolatile, bool IsNonTemporal,
bool IsLastUse) const override;
- bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
+ bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
@@ -2536,9 +2536,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
if (IsVolatile) {
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
- if (Op == SIMemOp::STORE)
- Changed |= insertWaitsBeforeSystemScopeStore(MI);
-
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
@@ -2551,11 +2548,26 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
-bool SIGfx12CacheControl::expandSystemScopeStore(
- MachineBasicBlock::iterator &MI) const {
- MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
- if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
- return insertWaitsBeforeSystemScopeStore(MI);
+bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
+ MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
+ if (!CPol)
+ return false;
+
+ const unsigned Scope = CPol->getImm() & CPol::SCOPE;
+
+ // GFX12.0 only: non-atomic system scope stores need extra waits first.
+ if (!ST.hasGFX1250Insts()) {
+ if (!Atomic && Scope == CPol::SCOPE_SYS)
+ return insertWaitsBeforeSystemScopeStore(MI);
+ return false;
+ }
+
+ // GFX12.5 only: Raise SCOPE_CU stores to SCOPE_SE when they may hit the
+ // scratch address space, or when the subtarget does not have the
+ // "cu-stores" feature (SCOPE_SE is then the minimum scope for a store).
+ if (Scope == CPol::SCOPE_CU &&
+ (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
+ return setScope(MI, CPol::SCOPE_SE);
return false;
}
@@ -2658,6 +2670,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
assert(!MI->mayLoad() && MI->mayStore());
bool Changed = false;
+ // FIXME: Necessary hack because iterator can lose track of the store.
+ MachineInstr &StoreMI = *MI;
if (MOI.isAtomic()) {
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
@@ -2674,6 +2688,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
+ Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
return Changed;
}
@@ -2686,7 +2701,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
// GFX12 specific, scope(desired coherence domain in cache hierarchy) is
// instruction field, do not confuse it with atomic scope.
- Changed |= CC->expandSystemScopeStore(MI);
+ Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 84cfa87..f3acc5c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -572,65 +572,6 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
}
-std::pair<unsigned, unsigned>
-SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
- const unsigned MaxVectorRegs = ST.getMaxNumVGPRs(MF);
-
- unsigned MaxNumVGPRs = MaxVectorRegs;
- unsigned MaxNumAGPRs = 0;
-
- // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
- // a wave may have up to 512 total vector registers combining together both
- // VGPRs and AGPRs. Hence, in an entry function without calls and without
- // AGPRs used within it, it is possible to use the whole vector register
- // budget for VGPRs.
- //
- // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
- // register file accordingly.
- if (ST.hasGFX90AInsts()) {
- unsigned MinNumAGPRs = 0;
- const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
- const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
-
- const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
-
- // TODO: Move this logic into subtarget on IR function
- //
- // TODO: The lower bound should probably force the number of required
- // registers up, overriding amdgpu-waves-per-eu.
- std::tie(MinNumAGPRs, MaxNumAGPRs) = AMDGPU::getIntegerPairAttribute(
- MF.getFunction(), "amdgpu-agpr-alloc", DefaultNumAGPR,
- /*OnlyFirstRequired=*/true);
-
- if (MinNumAGPRs == DefaultNumAGPR.first) {
- // Default to splitting half the registers if AGPRs are required.
- MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
- } else {
- // Align to accum_offset's allocation granularity.
- MinNumAGPRs = alignTo(MinNumAGPRs, 4);
-
- MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
- }
-
- // Clamp values to be inbounds of our limits, and ensure min <= max.
-
- MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
- MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
-
- MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
- MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
-
- assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
- MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
- "invalid register counts");
- } else if (ST.hasMAIInsts()) {
- // On gfx908 the number of AGPRs always equals the number of VGPRs.
- MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
- }
-
- return std::pair(MaxNumVGPRs, MaxNumAGPRs);
-}
-
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::MODE);
@@ -742,7 +683,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Reserve VGPRs/AGPRs.
//
- auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF);
+ auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
for (const TargetRegisterClass *RC : regclasses()) {
if (RC->isBaseClass() && isVGPRClass(RC)) {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 0008e5f..5508f07 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -90,11 +90,6 @@ public:
/// spilling is needed.
MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
- /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
- /// of waves per execution unit required for the function \p MF.
- std::pair<unsigned, unsigned>
- getMaxNumVectorRegs(const MachineFunction &MF) const;
-
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isAsmClobberable(const MachineFunction &MF,
MCRegister PhysReg) const override;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b5b3cc9..83e63ac 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -732,7 +732,14 @@ bool isGenericAtomic(unsigned Opc) {
}
bool isAsyncStore(unsigned Opc) {
- return false; // placeholder before async store implementation.
+ return Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_SADDR_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_SADDR_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_SADDR_gfx1250;
}
bool isTensorStore(unsigned Opc) {
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 50217c3..9e4dbec 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -4261,8 +4261,7 @@ std::optional<unsigned> ARMBaseInstrInfo::getOperandLatencyImpl(
// instructions).
if (Latency > 0 && Subtarget.isThumb2()) {
const MachineFunction *MF = DefMI.getParent()->getParent();
- // FIXME: Use Function::hasOptSize().
- if (MF->getFunction().hasFnAttribute(Attribute::OptimizeForSize))
+ if (MF->getFunction().hasOptSize())
--Latency;
}
return Latency;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index fca5dff..066b392 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -370,6 +370,11 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::FMINNUM, VT, Legal);
setOperationAction(ISD::FMAXNUM, VT, Legal);
setOperationAction(ISD::FROUND, VT, Legal);
+ setOperationAction(ISD::FROUNDEVEN, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
@@ -1507,6 +1512,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::FLOG2, MVT::f16, Promote);
setOperationAction(ISD::FROUND, MVT::f16, Legal);
+ setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
+ setOperationAction(ISD::FRINT, MVT::f16, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f16, Legal);
}
if (Subtarget->hasNEON()) {
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 868556b..6dfe846 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -1284,14 +1284,11 @@ void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; }
// Add the R_ARM_NONE fixup at the same position
void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
const MCSymbol *PersonalitySym = getContext().getOrCreateSymbol(Name);
+ visitUsedSymbol(*PersonalitySym);
const MCSymbolRefExpr *PersonalityRef =
MCSymbolRefExpr::create(PersonalitySym, ARM::S_ARM_NONE, getContext());
-
- visitUsedExpr(*PersonalityRef);
- MCFragment *DF = getCurrentFragment();
- DF->addFixup(
- MCFixup::create(DF->getContents().size(), PersonalityRef, FK_Data_4));
+ addFixup(PersonalityRef, FK_Data_4);
}
void ARMELFStreamer::FlushPendingOffset() {
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index a87b9a2..bed6bc9 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -957,47 +957,47 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) {
return;
}
- // MapDef type may be a struct type or a non-pointer derived type
- const DIType *OrigTy = Ty;
- while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
- auto Tag = DTy->getTag();
- if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type &&
- Tag != dwarf::DW_TAG_volatile_type &&
- Tag != dwarf::DW_TAG_restrict_type)
- break;
- Ty = DTy->getBaseType();
- }
-
- const auto *CTy = dyn_cast<DICompositeType>(Ty);
- if (!CTy)
- return;
-
- auto Tag = CTy->getTag();
- if (Tag != dwarf::DW_TAG_structure_type || CTy->isForwardDecl())
- return;
-
- // Visit all struct members to ensure their types are visited.
- const DINodeArray Elements = CTy->getElements();
- for (const auto *Element : Elements) {
- const auto *MemberType = cast<DIDerivedType>(Element);
- const DIType *MemberBaseType = MemberType->getBaseType();
-
- // If the member is a composite type, that may indicate the currently
- // visited composite type is a wrapper, and the member represents the
- // actual map definition.
- // In that case, visit the member with `visitMapDefType` instead of
- // `visitTypeEntry`, treating it specifically as a map definition rather
- // than as a regular composite type.
- const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType);
- if (MemberCTy) {
- visitMapDefType(MemberBaseType, TypeId);
- } else {
- visitTypeEntry(MemberBaseType);
+ uint32_t TmpId;
+ switch (Ty->getTag()) {
+ case dwarf::DW_TAG_typedef:
+ case dwarf::DW_TAG_const_type:
+ case dwarf::DW_TAG_volatile_type:
+ case dwarf::DW_TAG_restrict_type:
+ case dwarf::DW_TAG_pointer_type:
+ visitMapDefType(dyn_cast<DIDerivedType>(Ty)->getBaseType(), TmpId);
+ break;
+ case dwarf::DW_TAG_array_type:
+ // Visit nested map array and jump to the element type
+ visitMapDefType(dyn_cast<DICompositeType>(Ty)->getBaseType(), TmpId);
+ break;
+ case dwarf::DW_TAG_structure_type: {
+ // Visit all struct members to ensure their types are visited.
+ const auto *CTy = cast<DICompositeType>(Ty);
+ const DINodeArray Elements = CTy->getElements();
+ for (const auto *Element : Elements) {
+ const auto *MemberType = cast<DIDerivedType>(Element);
+ const DIType *MemberBaseType = MemberType->getBaseType();
+ // If the member is a composite type, that may indicate the currently
+ // visited composite type is a wrapper, and the member represents the
+ // actual map definition.
+ // In that case, visit the member with `visitMapDefType` instead of
+ // `visitTypeEntry`, treating it specifically as a map definition rather
+ // than as a regular composite type.
+ const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType);
+ if (MemberCTy) {
+ visitMapDefType(MemberBaseType, TmpId);
+ } else {
+ visitTypeEntry(MemberBaseType);
+ }
}
+ break;
+ }
+ default:
+ break;
}
// Visit this type, struct or a const/typedef/volatile/restrict type
- visitTypeEntry(OrigTy, TypeId, false, false);
+ visitTypeEntry(Ty, TypeId, false, false);
}
/// Read file contents from the actual file or from the source
diff --git a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index c86fa2b..54c3cea 100644
--- a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -457,7 +457,7 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
TII = ST->getInstrInfo();
const Function &F = MF.getFunction();
- bool OptForSize = F.hasFnAttribute(Attribute::OptimizeForSize);
+ bool OptForSize = F.hasOptSize();
// Combine aggressively (for code size)
ShouldCombineAggressively =
diff --git a/llvm/lib/Target/Hexagon/HexagonMask.cpp b/llvm/lib/Target/Hexagon/HexagonMask.cpp
index 6eccf80..9d7776d 100644
--- a/llvm/lib/Target/Hexagon/HexagonMask.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonMask.cpp
@@ -76,7 +76,7 @@ bool HexagonMask::runOnMachineFunction(MachineFunction &MF) {
HII = HST.getInstrInfo();
const Function &F = MF.getFunction();
- if (!F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (!F.hasOptSize())
return false;
// Mask instruction is available only from v66
if (!HST.hasV66Ops())
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 613cfb5..d96136c 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2385,13 +2385,6 @@ SDValue LoongArchTargetLowering::lowerBF16_TO_FP(SDValue Op,
return Res;
}
-static bool isConstantBUILD_VECTOR(const BuildVectorSDNode *Op) {
- for (unsigned i = 0; i < Op->getNumOperands(); ++i)
- if (isIntOrFPConstant(Op->getOperand(i)))
- return true;
- return false;
-}
-
// Lower BUILD_VECTOR as broadcast load (if possible).
// For example:
// %a = load i8, ptr %ptr
@@ -2441,10 +2434,14 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
BuildVectorSDNode *Node = cast<BuildVectorSDNode>(Op);
EVT ResTy = Op->getValueType(0);
+ unsigned NumElts = ResTy.getVectorNumElements();
SDLoc DL(Op);
APInt SplatValue, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
+ bool IsConstant = false;
+ bool UseSameConstant = true;
+ SDValue ConstantValue;
bool Is128Vec = ResTy.is128BitVector();
bool Is256Vec = ResTy.is256BitVector();
@@ -2495,13 +2492,35 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
if (DAG.isSplatValue(Op, /*AllowUndefs=*/false))
return Op;
- if (!isConstantBUILD_VECTOR(Node)) {
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Opi = Node->getOperand(i);
+ if (isIntOrFPConstant(Opi)) {
+ IsConstant = true;
+ if (!ConstantValue.getNode())
+ ConstantValue = Opi;
+ else if (ConstantValue != Opi)
+ UseSameConstant = false;
+ }
+ }
+
+ // If the type of BUILD_VECTOR is v2f64, custom legalizing it has no benefits.
+ if (IsConstant && UseSameConstant && ResTy != MVT::v2f64) {
+ SDValue Result = DAG.getSplatBuildVector(ResTy, DL, ConstantValue);
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Opi = Node->getOperand(i);
+ if (!isIntOrFPConstant(Opi))
+ Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Result, Opi,
+ DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
+ }
+ return Result;
+ }
+
+ if (!IsConstant) {
// Use INSERT_VECTOR_ELT operations rather than expand to stores.
// The resulting code is the same length as the expansion, but it doesn't
// use memory operations.
assert(ResTy.isVector());
- unsigned NumElts = ResTy.getVectorNumElements();
SDValue Op0 = Node->getOperand(0);
SDValue Vector = DAG.getUNDEF(ResTy);
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index d9680c7..7a8395a 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -1034,12 +1034,14 @@ MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
void MipsTargetELFStreamer::emitGPRel32Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(4);
S.addFixup(Value, Mips::fixup_Mips_GPREL32);
S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(8);
// fixup_Mips_GPREL32 desginates R_MIPS_GPREL32+R_MIPS_64 on MIPS64.
S.addFixup(Value, Mips::fixup_Mips_GPREL32);
S.appendContents(8, 0);
@@ -1047,24 +1049,28 @@ void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) {
void MipsTargetELFStreamer::emitDTPRel32Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(4);
S.addFixup(Value, Mips::fixup_Mips_DTPREL32);
S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitDTPRel64Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(8);
S.addFixup(Value, Mips::fixup_Mips_DTPREL64);
S.appendContents(8, 0);
}
void MipsTargetELFStreamer::emitTPRel32Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(4);
S.addFixup(Value, Mips::fixup_Mips_TPREL32);
S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitTPRel64Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(8);
S.addFixup(Value, Mips::fixup_Mips_TPREL64);
S.appendContents(8, 0);
}
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index ca03310..a2e48ab 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -737,14 +737,18 @@ void MipsAsmPrinter::emitStartOfAsmFile(Module &M) {
if (FS.empty() && M.size() && F->hasFnAttribute("target-features"))
FS = F->getFnAttribute("target-features").getValueAsString();
+ std::string strFS = FS.str();
+ if (M.size() && F->getFnAttribute("use-soft-float").getValueAsBool())
+ strFS += strFS.empty() ? "+soft-float" : ",+soft-float";
+
// Compute MIPS architecture attributes based on the default subtarget
// that we'd have constructed.
// FIXME: For ifunc related functions we could iterate over and look
// for a feature string that doesn't match the default one.
StringRef CPU = MIPS_MC::selectMipsCPU(TT, TM.getTargetCPU());
const MipsTargetMachine &MTM = static_cast<const MipsTargetMachine &>(TM);
- const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM,
- std::nullopt);
+ const MipsSubtarget STI(TT, CPU, StringRef(strFS), MTM.isLittleEndian(),
+ MTM, std::nullopt);
bool IsABICalls = STI.isABICalls();
const MipsABIInfo &ABI = MTM.getABI();
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 0e581a7..ec6b382 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -522,9 +522,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::TRAP, MVT::Other, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
- setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
-
setTargetDAGCombine({ISD::SDIVREM, ISD::UDIVREM, ISD::SELECT, ISD::AND,
ISD::OR, ISD::ADD, ISD::SUB, ISD::AssertZext, ISD::SHL,
ISD::SIGN_EXTEND});
@@ -1360,8 +1357,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
case ISD::FP_TO_SINT: return lowerFP_TO_SINT(Op, DAG);
case ISD::READCYCLECOUNTER:
return lowerREADCYCLECOUNTER(Op, DAG);
- case ISD::ConstantFP:
- return lowerConstantFP(Op, DAG);
}
return SDValue();
}
@@ -3019,30 +3014,6 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op.getValueType(), Trunc);
}
-SDValue MipsTargetLowering::lowerConstantFP(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT VT = Op.getSimpleValueType();
- SDNode *N = Op.getNode();
- ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(N);
-
- if (!CFP->isNaN() || Subtarget.isNaN2008()) {
- return SDValue();
- }
-
- APFloat NaNValue = CFP->getValueAPF();
- auto &Sem = NaNValue.getSemantics();
-
- // The MSB of the mantissa should be zero for QNaNs in the MIPS legacy NaN
- // encodings, and one for sNaNs. Check every NaN constants and make sure
- // they are correctly encoded for legacy encodings.
- if (!NaNValue.isSignaling()) {
- APFloat RealQNaN = NaNValue.getSNaN(Sem);
- return DAG.getConstantFP(RealQNaN, DL, VT);
- }
- return SDValue();
-}
-
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h
index 31ac5d4..c65c76c 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.h
+++ b/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -592,7 +592,6 @@ class TargetRegisterClass;
SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;
/// isEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization.
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 65e7c56..96f52275 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -145,18 +145,6 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
if (tryStoreVector(N))
return;
break;
- case NVPTXISD::LoadParam:
- case NVPTXISD::LoadParamV2:
- case NVPTXISD::LoadParamV4:
- if (tryLoadParam(N))
- return;
- break;
- case NVPTXISD::StoreParam:
- case NVPTXISD::StoreParamV2:
- case NVPTXISD::StoreParamV4:
- if (tryStoreParam(N))
- return;
- break;
case ISD::INTRINSIC_W_CHAIN:
if (tryIntrinsicChain(N))
return;
@@ -1462,267 +1450,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
return true;
}
-bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
- SDValue Chain = Node->getOperand(0);
- SDValue Offset = Node->getOperand(2);
- SDValue Glue = Node->getOperand(3);
- SDLoc DL(Node);
- MemSDNode *Mem = cast<MemSDNode>(Node);
-
- unsigned VecSize;
- switch (Node->getOpcode()) {
- default:
- return false;
- case NVPTXISD::LoadParam:
- VecSize = 1;
- break;
- case NVPTXISD::LoadParamV2:
- VecSize = 2;
- break;
- case NVPTXISD::LoadParamV4:
- VecSize = 4;
- break;
- }
-
- EVT EltVT = Node->getValueType(0);
- EVT MemVT = Mem->getMemoryVT();
-
- std::optional<unsigned> Opcode;
-
- switch (VecSize) {
- default:
- return false;
- case 1:
- Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
- NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
- NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64);
- break;
- case 2:
- Opcode =
- pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
- NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
- NVPTX::LoadParamMemV2I64);
- break;
- case 4:
- Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
- NVPTX::LoadParamMemV4I8, NVPTX::LoadParamMemV4I16,
- NVPTX::LoadParamMemV4I32, {/* no v4i64 */});
- break;
- }
- if (!Opcode)
- return false;
-
- SDVTList VTs;
- if (VecSize == 1) {
- VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
- } else if (VecSize == 2) {
- VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
- } else {
- EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
- VTs = CurDAG->getVTList(EVTs);
- }
-
- unsigned OffsetVal = Offset->getAsZExtVal();
-
- SmallVector<SDValue, 2> Ops(
- {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
-
- ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
- return true;
-}
-
-// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri)
-#define getOpcV2H(ty, opKind0, opKind1) \
- NVPTX::StoreParamV2##ty##_##opKind0##opKind1
-
-#define getOpcV2H1(ty, opKind0, isImm1) \
- (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r)
-
-#define getOpcodeForVectorStParamV2(ty, isimm) \
- (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])
-
-#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \
- NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3
-
-#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \
- (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \
- : getOpcV4H(ty, opKind0, opKind1, opKind2, r)
-
-#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \
- (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \
- : getOpcV4H3(ty, opKind0, opKind1, r, isImm3)
-
-#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \
- (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \
- : getOpcV4H2(ty, opKind0, r, isImm2, isImm3)
-
-#define getOpcodeForVectorStParamV4(ty, isimm) \
- (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \
- : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])
-
-#define getOpcodeForVectorStParam(n, ty, isimm) \
- (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \
- : getOpcodeForVectorStParamV4(ty, isimm)
-
-static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops,
- unsigned NumElts,
- MVT::SimpleValueType MemTy,
- SelectionDAG *CurDAG, SDLoc DL) {
- // Determine which inputs are registers and immediates make new operators
- // with constant values
- SmallVector<bool, 4> IsImm(NumElts, false);
- for (unsigned i = 0; i < NumElts; i++) {
- IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i]));
- if (IsImm[i]) {
- SDValue Imm = Ops[i];
- if (MemTy == MVT::f32 || MemTy == MVT::f64) {
- const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
- const ConstantFP *CF = ConstImm->getConstantFPValue();
- Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
- } else {
- const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
- const ConstantInt *CI = ConstImm->getConstantIntValue();
- Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
- }
- Ops[i] = Imm;
- }
- }
-
- // Get opcode for MemTy, size, and register/immediate operand ordering
- switch (MemTy) {
- case MVT::i8:
- return getOpcodeForVectorStParam(NumElts, I8, IsImm);
- case MVT::i16:
- return getOpcodeForVectorStParam(NumElts, I16, IsImm);
- case MVT::i32:
- return getOpcodeForVectorStParam(NumElts, I32, IsImm);
- case MVT::i64:
- assert(NumElts == 2 && "MVT too large for NumElts > 2");
- return getOpcodeForVectorStParamV2(I64, IsImm);
- case MVT::f32:
- return getOpcodeForVectorStParam(NumElts, F32, IsImm);
- case MVT::f64:
- assert(NumElts == 2 && "MVT too large for NumElts > 2");
- return getOpcodeForVectorStParamV2(F64, IsImm);
-
- // These cases don't support immediates, just use the all register version
- // and generate moves.
- case MVT::i1:
- return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
- : NVPTX::StoreParamV4I8_rrrr;
- case MVT::f16:
- case MVT::bf16:
- return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
- : NVPTX::StoreParamV4I16_rrrr;
- case MVT::v2f16:
- case MVT::v2bf16:
- case MVT::v2i16:
- case MVT::v4i8:
- return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
- : NVPTX::StoreParamV4I32_rrrr;
- default:
- llvm_unreachable("Cannot select st.param for unknown MemTy");
- }
-}
-
-bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
- SDLoc DL(N);
- SDValue Chain = N->getOperand(0);
- SDValue Param = N->getOperand(1);
- unsigned ParamVal = Param->getAsZExtVal();
- SDValue Offset = N->getOperand(2);
- unsigned OffsetVal = Offset->getAsZExtVal();
- MemSDNode *Mem = cast<MemSDNode>(N);
- SDValue Glue = N->getOperand(N->getNumOperands() - 1);
-
- // How many elements do we have?
- unsigned NumElts;
- switch (N->getOpcode()) {
- default:
- llvm_unreachable("Unexpected opcode");
- case NVPTXISD::StoreParam:
- NumElts = 1;
- break;
- case NVPTXISD::StoreParamV2:
- NumElts = 2;
- break;
- case NVPTXISD::StoreParamV4:
- NumElts = 4;
- break;
- }
-
- // Build vector of operands
- SmallVector<SDValue, 8> Ops;
- for (unsigned i = 0; i < NumElts; ++i)
- Ops.push_back(N->getOperand(i + 3));
- Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32),
- CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
-
- // Determine target opcode
- // If we have an i1, use an 8-bit store. The lowering code in
- // NVPTXISelLowering will have already emitted an upcast.
- std::optional<unsigned> Opcode;
- switch (NumElts) {
- default:
- llvm_unreachable("Unexpected NumElts");
- case 1: {
- MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
- SDValue Imm = Ops[0];
- if (MemTy != MVT::f16 && MemTy != MVT::bf16 &&
- (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) {
- // Convert immediate to target constant
- if (MemTy == MVT::f32 || MemTy == MVT::f64) {
- const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
- const ConstantFP *CF = ConstImm->getConstantFPValue();
- Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
- } else {
- const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
- const ConstantInt *CI = ConstImm->getConstantIntValue();
- Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
- }
- Ops[0] = Imm;
- // Use immediate version of store param
- Opcode =
- pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i, NVPTX::StoreParamI16_i,
- NVPTX::StoreParamI32_i, NVPTX::StoreParamI64_i);
- } else
- Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
- NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
- NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r);
- if (Opcode == NVPTX::StoreParamI8_r) {
- // Fine tune the opcode depending on the size of the operand.
- // This helps to avoid creating redundant COPY instructions in
- // InstrEmitter::AddRegisterOperand().
- switch (Ops[0].getSimpleValueType().SimpleTy) {
- default:
- break;
- case MVT::i32:
- Opcode = NVPTX::StoreParamI8TruncI32_r;
- break;
- case MVT::i64:
- Opcode = NVPTX::StoreParamI8TruncI64_r;
- break;
- }
- }
- break;
- }
- case 2:
- case 4: {
- MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
- Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);
- break;
- }
- }
-
- SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
- SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
- MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
- CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
-
- ReplaceNode(N, Ret);
- return true;
-}
-
/// SelectBFE - Look for instruction sequences that can be made more efficient
/// by using the 'bfe' (bit-field extract) PTX instruction
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index b99b4ef..e504a8f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -78,8 +78,6 @@ private:
bool tryLDG(MemSDNode *N);
bool tryStore(SDNode *N);
bool tryStoreVector(SDNode *N);
- bool tryLoadParam(SDNode *N);
- bool tryStoreParam(SDNode *N);
bool tryFence(SDNode *N);
void SelectAddrSpaceCast(SDNode *N);
bool tryBFE(SDNode *N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index ddcecc00..f79b862 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1075,12 +1075,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::DeclareArrayParam)
MAKE_CASE(NVPTXISD::DeclareScalarParam)
MAKE_CASE(NVPTXISD::CALL)
- MAKE_CASE(NVPTXISD::LoadParam)
- MAKE_CASE(NVPTXISD::LoadParamV2)
- MAKE_CASE(NVPTXISD::LoadParamV4)
- MAKE_CASE(NVPTXISD::StoreParam)
- MAKE_CASE(NVPTXISD::StoreParamV2)
- MAKE_CASE(NVPTXISD::StoreParamV4)
MAKE_CASE(NVPTXISD::MoveParam)
MAKE_CASE(NVPTXISD::UNPACK_VECTOR)
MAKE_CASE(NVPTXISD::BUILD_VECTOR)
@@ -1318,105 +1312,6 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
return DL.getABITypeAlign(Ty);
}
-static bool adjustElementType(EVT &ElementType) {
- switch (ElementType.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::f16:
- case MVT::bf16:
- ElementType = MVT::i16;
- return true;
- case MVT::f32:
- case MVT::v2f16:
- case MVT::v2bf16:
- ElementType = MVT::i32;
- return true;
- case MVT::f64:
- ElementType = MVT::i64;
- return true;
- }
-}
-
-// Use byte-store when the param address of the argument value is unaligned.
-// This may happen when the return value is a field of a packed structure.
-//
-// This is called in LowerCall() when passing the param values.
-static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
- uint64_t Offset, EVT ElementType,
- SDValue StVal, SDValue &InGlue,
- unsigned ArgID, const SDLoc &dl) {
- // Bit logic only works on integer types
- if (adjustElementType(ElementType))
- StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
-
- // Store each byte
- SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
- // Shift the byte to the last byte position
- SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
- DAG.getConstant(i * 8, dl, MVT::i32));
- SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
- DAG.getConstant(Offset + i, dl, MVT::i32),
- ShiftVal, InGlue};
- // Trunc store only the last byte by using
- // st.param.b8
- // The register type can be larger than b8.
- Chain = DAG.getMemIntrinsicNode(
- NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
- MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
- InGlue = Chain.getValue(1);
- }
- return Chain;
-}
-
-// Use byte-load when the param adress of the returned value is unaligned.
-// This may happen when the returned value is a field of a packed structure.
-static SDValue
-LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
- EVT ElementType, SDValue &InGlue,
- SmallVectorImpl<SDValue> &TempProxyRegOps,
- const SDLoc &dl) {
- // Bit logic only works on integer types
- EVT MergedType = ElementType;
- adjustElementType(MergedType);
-
- // Load each byte and construct the whole value. Initial value to 0
- SDValue RetVal = DAG.getConstant(0, dl, MergedType);
- // LoadParamMemI8 loads into i16 register only
- SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
- for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
- SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(Offset + i, dl, MVT::i32),
- InGlue};
- // This will be selected to LoadParamMemI8
- SDValue LdVal =
- DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
- MVT::i8, MachinePointerInfo(), Align(1));
- SDValue TmpLdVal = LdVal.getValue(0);
- Chain = LdVal.getValue(1);
- InGlue = LdVal.getValue(2);
-
- TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
- TmpLdVal.getSimpleValueType(), TmpLdVal);
- TempProxyRegOps.push_back(TmpLdVal);
-
- SDValue CMask = DAG.getConstant(255, dl, MergedType);
- SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
- // Need to extend the i16 register to the whole width.
- TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
- // Mask off the high bits. Leave only the lower 8bits.
- // Do this because we are using loadparam.b8.
- TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
- // Shift and merge
- TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
- RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
- }
- if (ElementType != MergedType)
- RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
-
- return RetVal;
-}
-
static bool shouldConvertToIndirectCall(const CallBase *CB,
const GlobalAddressSDNode *Func) {
if (!Func)
@@ -1483,10 +1378,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SelectionDAG &DAG = CLI.DAG;
SDLoc dl = CLI.DL;
- SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
- SDValue Chain = CLI.Chain;
+ const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Callee = CLI.Callee;
- bool &isTailCall = CLI.IsTailCall;
ArgListTy &Args = CLI.getArgs();
Type *RetTy = CLI.RetTy;
const CallBase *CB = CLI.CB;
@@ -1496,6 +1389,36 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
return DAG.getConstant(I, dl, MVT::i32);
};
+ const unsigned UniqueCallSite = GlobalUniqueCallSite++;
+ const SDValue CallChain = CLI.Chain;
+ const SDValue StartChain =
+ DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl);
+ SDValue DeclareGlue = StartChain.getValue(1);
+
+ SmallVector<SDValue, 16> CallPrereqs{StartChain};
+
+ const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) {
+ // PTX ABI requires integral types to be at least 32 bits in size. FP16 is
+ // loaded/stored using i16, so it's handled here as well.
+ const unsigned SizeBits = promoteScalarArgumentSize(Size * 8);
+ SDValue Declare =
+ DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
+ {StartChain, Symbol, GetI32(SizeBits), DeclareGlue});
+ CallPrereqs.push_back(Declare);
+ DeclareGlue = Declare.getValue(1);
+ return Declare;
+ };
+
+ const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align,
+ unsigned Size) {
+ SDValue Declare = DAG.getNode(
+ NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
+ {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue});
+ CallPrereqs.push_back(Declare);
+ DeclareGlue = Declare.getValue(1);
+ return Declare;
+ };
+
// Variadic arguments.
//
// Normally, for each argument, we declare a param scalar or a param
@@ -1511,15 +1434,17 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
//
// After all vararg is processed, 'VAOffset' holds the size of the
// vararg byte array.
+ assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) &&
+ "Non-VarArg function with extra arguments");
- SDValue VADeclareParam; // vararg byte array
const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic
- unsigned VAOffset = 0; // current offset in the param array
+ unsigned VAOffset = 0; // current offset in the param array
- const unsigned UniqueCallSite = GlobalUniqueCallSite++;
- SDValue TempChain = Chain;
- Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
- SDValue InGlue = Chain.getValue(1);
+ const SDValue VADeclareParam =
+ CLI.Args.size() > FirstVAArg
+ ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32),
+ Align(STI.getMaxRequiredAlignment()), 0)
+ : SDValue();
// Args.size() and Outs.size() need not match.
// Outs.size() will be larger
@@ -1580,43 +1505,19 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert((!IsByVal || TypeSize == ArgOuts[0].Flags.getByValSize()) &&
"type size mismatch");
- const std::optional<SDValue> ArgDeclare = [&]() -> std::optional<SDValue> {
- if (IsVAArg) {
- if (ArgI == FirstVAArg) {
- VADeclareParam = DAG.getNode(
- NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
- {Chain, ParamSymbol, GetI32(STI.getMaxRequiredAlignment()),
- GetI32(0), InGlue});
- return VADeclareParam;
- }
- return std::nullopt;
- }
- if (IsByVal || shouldPassAsArray(Arg.Ty)) {
- // declare .param .align <align> .b8 .param<n>[<size>];
- return DAG.getNode(NVPTXISD::DeclareArrayParam, dl,
- {MVT::Other, MVT::Glue},
- {Chain, ParamSymbol, GetI32(ArgAlign.value()),
- GetI32(TypeSize), InGlue});
- }
+ const SDValue ArgDeclare = [&]() {
+ if (IsVAArg)
+ return VADeclareParam;
+
+ if (IsByVal || shouldPassAsArray(Arg.Ty))
+ return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TypeSize);
+
assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
- // declare .param .b<size> .param<n>;
-
- // PTX ABI requires integral types to be at least 32 bits in
- // size. FP16 is loaded/stored using i16, so it's handled
- // here as well.
- const unsigned PromotedSize =
- (ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint())
- ? promoteScalarArgumentSize(TypeSize * 8)
- : TypeSize * 8;
-
- return DAG.getNode(NVPTXISD::DeclareScalarParam, dl,
- {MVT::Other, MVT::Glue},
- {Chain, ParamSymbol, GetI32(PromotedSize), InGlue});
+ assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
+ "Only int and float types are supported as non-array arguments");
+
+ return MakeDeclareScalarParam(ParamSymbol, TypeSize);
}();
- if (ArgDeclare) {
- Chain = ArgDeclare->getValue(0);
- InGlue = ArgDeclare->getValue(1);
- }
// PTX Interoperability Guide 3.3(A): [Integer] Values shorter
// than 32-bits are sign extended or zero extended, depending on
@@ -1626,36 +1527,25 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
const auto GetStoredValue = [&](const unsigned I, EVT EltVT,
- const Align PartAlign) {
- SDValue StVal;
+ const MaybeAlign PartAlign) {
if (IsByVal) {
SDValue Ptr = ArgOutVals[0];
auto MPI = refinePtrAS(Ptr, DAG, DL, *this);
SDValue SrcAddr =
DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(Offsets[I]));
- StVal = DAG.getLoad(EltVT, dl, TempChain, SrcAddr, MPI, PartAlign);
- } else {
- StVal = ArgOutVals[I];
-
- auto PromotedVT = promoteScalarIntegerPTX(StVal.getValueType());
- if (PromotedVT != StVal.getValueType()) {
- StVal = DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, PromotedVT,
- StVal);
- }
+ return DAG.getLoad(EltVT, dl, CallChain, SrcAddr, MPI, PartAlign);
}
+ SDValue StVal = ArgOutVals[I];
+ assert(promoteScalarIntegerPTX(StVal.getValueType()) ==
+ StVal.getValueType() &&
+ "OutVal type should always be legal");
- if (ExtendIntegerParam) {
- assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
- // zext/sext to i32
- StVal =
- DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, MVT::i32, StVal);
- } else if (EltVT.getSizeInBits() < 16) {
- // Use 16-bit registers for small stores as it's the
- // smallest general purpose register size supported by NVPTX.
- StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
- }
- return StVal;
+ const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
+ const EVT StoreVT =
+ ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
+
+ return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
};
const auto VectorInfo =
@@ -1664,23 +1554,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned J = 0;
for (const unsigned NumElts : VectorInfo) {
const int CurOffset = Offsets[J];
- EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
- const Align PartAlign = commonAlignment(ArgAlign, CurOffset);
-
- // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
- // scalar store. In such cases, fall back to byte stores.
- if (NumElts == 1 && !IsVAArg && PartAlign < DAG.getEVTAlign(EltVT)) {
-
- SDValue StVal = GetStoredValue(J, EltVT, PartAlign);
- Chain = LowerUnalignedStoreParam(DAG, Chain,
- CurOffset + (IsByVal ? VAOffset : 0),
- EltVT, StVal, InGlue, ArgI, dl);
-
- // LowerUnalignedStoreParam took care of inserting the necessary nodes
- // into the SDAG, so just move on to the next element.
- J++;
- continue;
- }
+ const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
if (IsVAArg && !IsByVal)
// Align each part of the variadic argument to their type.
@@ -1688,44 +1562,45 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert((IsVAArg || VAOffset == 0) &&
"VAOffset must be 0 for non-VA args");
- SmallVector<SDValue, 6> StoreOperands{
- Chain, GetI32(IsVAArg ? FirstVAArg : ArgI),
- GetI32(VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset))};
- // Record the values to store.
- for (const unsigned K : llvm::seq(NumElts))
- StoreOperands.push_back(GetStoredValue(J + K, EltVT, PartAlign));
- StoreOperands.push_back(InGlue);
+ const unsigned Offset =
+ (VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset));
+ SDValue Ptr =
+ DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
- NVPTXISD::NodeType Op;
- switch (NumElts) {
- case 1:
- Op = NVPTXISD::StoreParam;
- break;
- case 2:
- Op = NVPTXISD::StoreParamV2;
- break;
- case 4:
- Op = NVPTXISD::StoreParamV4;
- break;
- default:
- llvm_unreachable("Invalid vector info.");
+ const MaybeAlign CurrentAlign = ExtendIntegerParam
+ ? MaybeAlign(std::nullopt)
+ : commonAlignment(ArgAlign, Offset);
+
+ SDValue Val;
+ if (NumElts == 1) {
+ Val = GetStoredValue(J, EltVT, CurrentAlign);
+ } else {
+ SmallVector<SDValue, 8> StoreVals;
+ for (const unsigned K : llvm::seq(NumElts)) {
+ SDValue ValJ = GetStoredValue(J + K, EltVT, CurrentAlign);
+ if (ValJ.getValueType().isVector())
+ DAG.ExtractVectorElements(ValJ, StoreVals);
+ else
+ StoreVals.push_back(ValJ);
+ }
+
+ EVT VT = EVT::getVectorVT(
+ *DAG.getContext(), StoreVals[0].getValueType(), StoreVals.size());
+ Val = DAG.getBuildVector(VT, dl, StoreVals);
}
- // Adjust type of the store op if we've extended the scalar
- // return value.
- EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
- Chain = DAG.getMemIntrinsicNode(
- Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
- TheStoreType, MachinePointerInfo(), PartAlign,
- MachineMemOperand::MOStore);
- InGlue = Chain.getValue(1);
+ SDValue StoreParam =
+ DAG.getStore(ArgDeclare, dl, Val, Ptr,
+ MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
+ CallPrereqs.push_back(StoreParam);
// TODO: We may need to support vector types that can be passed
// as scalars in variadic arguments.
if (IsVAArg && !IsByVal) {
assert(NumElts == 1 &&
"Vectorization is expected to be disabled for variadics.");
+ const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
VAOffset +=
DL.getTypeAllocSize(TheStoreType.getTypeForEVT(*DAG.getContext()));
}
@@ -1736,33 +1611,21 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
VAOffset += TypeSize;
}
- GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
-
// Handle Result
if (!Ins.empty()) {
- const SDValue RetDeclare = [&]() {
- const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
- const unsigned ResultSize = DL.getTypeAllocSizeInBits(RetTy);
- if (shouldPassAsArray(RetTy)) {
- const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
- return DAG.getNode(NVPTXISD::DeclareArrayParam, dl,
- {MVT::Other, MVT::Glue},
- {Chain, RetSymbol, GetI32(RetAlign.value()),
- GetI32(ResultSize / 8), InGlue});
- }
- const auto PromotedResultSize = promoteScalarArgumentSize(ResultSize);
- return DAG.getNode(
- NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
- {Chain, RetSymbol, GetI32(PromotedResultSize), InGlue});
- }();
- Chain = RetDeclare.getValue(0);
- InGlue = RetDeclare.getValue(1);
+ const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
+ const unsigned ResultSize = DL.getTypeAllocSize(RetTy);
+ if (shouldPassAsArray(RetTy)) {
+ const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
+ MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize);
+ } else {
+ MakeDeclareScalarParam(RetSymbol, ResultSize);
+ }
}
- const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
// Set the size of the vararg param byte array if the callee is a variadic
// function and the variadic part is not empty.
- if (HasVAArgs) {
+ if (VADeclareParam) {
SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0),
VADeclareParam.getOperand(1),
VADeclareParam.getOperand(2), GetI32(VAOffset),
@@ -1771,6 +1634,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
VADeclareParam->getVTList(), DeclareParamOps);
}
+ const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
// If the type of the callsite does not match that of the function, convert
// the callsite to an indirect call.
const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func);
@@ -1800,15 +1664,16 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// instruction.
// The prototype is embedded in a string and put as the operand for a
// CallPrototype SDNode which will print out to the value of the string.
+ const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
std::string Proto =
getPrototype(DL, RetTy, Args, CLI.Outs,
HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,
UniqueCallSite);
const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
- Chain = DAG.getNode(
- NVPTXISD::CallPrototype, dl, {MVT::Other, MVT::Glue},
- {Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InGlue});
- InGlue = Chain.getValue(1);
+ const SDValue PrototypeDeclare = DAG.getNode(
+ NVPTXISD::CallPrototype, dl, MVT::Other,
+ {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)});
+ CallPrereqs.push_back(PrototypeDeclare);
}
if (ConvertToIndirectCall) {
@@ -1826,24 +1691,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const unsigned NumArgs =
std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
- /// NumParams, Callee, Proto, InGlue)
- Chain = DAG.getNode(NVPTXISD::CALL, dl, {MVT::Other, MVT::Glue},
- {Chain, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
- GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee,
- GetI32(Proto), InGlue});
- InGlue = Chain.getValue(1);
-
+ /// NumParams, Callee, Proto)
+ const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs);
+ const SDValue Call = DAG.getNode(
+ NVPTXISD::CALL, dl, MVT::Other,
+ {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
+ GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)});
+
+ SmallVector<SDValue, 16> LoadChains{Call};
SmallVector<SDValue, 16> ProxyRegOps;
- // An item of the vector is filled if the element does not need a ProxyReg
- // operation on it and should be added to InVals as is. ProxyRegOps and
- // ProxyRegTruncates contain empty/none items at the same index.
- SmallVector<SDValue, 16> RetElts;
- // A temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
- // to use the values of `LoadParam`s and to be replaced later then
- // `CALLSEQ_END` is added.
- SmallVector<SDValue, 16> TempProxyRegOps;
-
- // Generate loads from param memory/moves from registers for result
if (!Ins.empty()) {
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
@@ -1860,104 +1716,65 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
unsigned I = 0;
- for (const unsigned VectorizedSize : VectorInfo) {
- EVT TheLoadType = promoteScalarIntegerPTX(VTs[I]);
- EVT EltType = Ins[I].VT;
- const Align EltAlign = commonAlignment(RetAlign, Offsets[I]);
-
- if (TheLoadType != VTs[I])
- EltType = TheLoadType;
-
- if (ExtendIntegerRetVal) {
- TheLoadType = MVT::i32;
- EltType = MVT::i32;
- } else if (TheLoadType.getSizeInBits() < 16) {
- EltType = MVT::i16;
- }
+ for (const unsigned NumElts : VectorInfo) {
+ const MaybeAlign CurrentAlign =
+ ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
+ : commonAlignment(RetAlign, Offsets[I]);
- // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
- // scalar load. In such cases, fall back to byte loads.
- if (VectorizedSize == 1 && RetTy->isAggregateType() &&
- EltAlign < DAG.getEVTAlign(TheLoadType)) {
- SDValue Ret = LowerUnalignedLoadRetParam(
- DAG, Chain, Offsets[I], TheLoadType, InGlue, TempProxyRegOps, dl);
- ProxyRegOps.push_back(SDValue());
- RetElts.resize(I);
- RetElts.push_back(Ret);
-
- I++;
- continue;
- }
+ const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
+ const EVT LoadVT =
+ ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
- SmallVector<EVT, 6> LoadVTs(VectorizedSize, EltType);
- LoadVTs.append({MVT::Other, MVT::Glue});
+ const unsigned PackingAmt =
+ LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
- NVPTXISD::NodeType Op;
- switch (VectorizedSize) {
- case 1:
- Op = NVPTXISD::LoadParam;
- break;
- case 2:
- Op = NVPTXISD::LoadParamV2;
- break;
- case 4:
- Op = NVPTXISD::LoadParamV4;
- break;
- default:
- llvm_unreachable("Invalid vector info.");
- }
+ const EVT VecVT = NumElts == 1 ? LoadVT
+ : EVT::getVectorVT(*DAG.getContext(),
+ LoadVT.getScalarType(),
+ NumElts * PackingAmt);
- SDValue LoadOperands[] = {Chain, GetI32(1), GetI32(Offsets[I]), InGlue};
- SDValue RetVal = DAG.getMemIntrinsicNode(
- Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
- MachinePointerInfo(), EltAlign, MachineMemOperand::MOLoad);
+ const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
+ SDValue Ptr =
+ DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
- for (const unsigned J : llvm::seq(VectorizedSize)) {
- ProxyRegOps.push_back(RetVal.getValue(J));
- }
+ SDValue R =
+ DAG.getLoad(VecVT, dl, Call, Ptr,
+ MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
- Chain = RetVal.getValue(VectorizedSize);
- InGlue = RetVal.getValue(VectorizedSize + 1);
+ LoadChains.push_back(R.getValue(1));
- I += VectorizedSize;
+ if (NumElts == 1)
+ ProxyRegOps.push_back(R);
+ else
+ for (const unsigned J : llvm::seq(NumElts)) {
+ SDValue Elt = DAG.getNode(
+ LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR
+ : ISD::EXTRACT_VECTOR_ELT,
+ dl, LoadVT, R, DAG.getVectorIdxConstant(J * PackingAmt, dl));
+ ProxyRegOps.push_back(Elt);
+ }
+ I += NumElts;
}
}
- Chain =
- DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
- InGlue = Chain.getValue(1);
+ const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains);
+ const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite,
+ UniqueCallSite + 1, SDValue(), dl);
// Append ProxyReg instructions to the chain to make sure that `callseq_end`
// will not get lost. Otherwise, during libcalls expansion, the nodes can become
// dangling.
- for (const unsigned I : llvm::seq(ProxyRegOps.size())) {
- if (I < RetElts.size() && RetElts[I]) {
- InVals.push_back(RetElts[I]);
- continue;
- }
-
- SDValue Ret =
- DAG.getNode(NVPTXISD::ProxyReg, dl, ProxyRegOps[I].getSimpleValueType(),
- {Chain, ProxyRegOps[I]});
-
- const EVT ExpectedVT = Ins[I].VT;
- if (!Ret.getValueType().bitsEq(ExpectedVT)) {
- Ret = DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, Ret);
- }
+ for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) {
+ SDValue Proxy =
+ DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg});
+ SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl);
InVals.push_back(Ret);
}
- for (SDValue &T : TempProxyRegOps) {
- SDValue Repl = DAG.getNode(NVPTXISD::ProxyReg, dl, T.getSimpleValueType(),
- {Chain, T.getOperand(0)});
- DAG.ReplaceAllUsesWith(T, Repl);
- DAG.RemoveDeadNode(T.getNode());
- }
-
- // set isTailCall to false for now, until we figure out how to express
+ // set IsTailCall to false for now, until we figure out how to express
// tail call optimization in PTX
- isTailCall = false;
- return Chain;
+ CLI.IsTailCall = false;
+ return CallEnd;
}
SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
@@ -5117,10 +4934,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
Operands.push_back(DCI.DAG.getIntPtrConstant(
cast<LoadSDNode>(LD)->getExtensionType(), DL));
break;
- case NVPTXISD::LoadParamV2:
- OldNumOutputs = 2;
- Opcode = NVPTXISD::LoadParamV4;
- break;
case NVPTXISD::LoadV2:
OldNumOutputs = 2;
Opcode = NVPTXISD::LoadV4;
@@ -5201,12 +5014,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
MemVT = ST->getMemoryVT();
Opcode = NVPTXISD::StoreV2;
break;
- case NVPTXISD::StoreParam:
- Opcode = NVPTXISD::StoreParamV2;
- break;
- case NVPTXISD::StoreParamV2:
- Opcode = NVPTXISD::StoreParamV4;
- break;
case NVPTXISD::StoreV2:
MemVT = ST->getMemoryVT();
Opcode = NVPTXISD::StoreV4;
@@ -5218,7 +5025,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
return SDValue();
Opcode = NVPTXISD::StoreV8;
break;
- case NVPTXISD::StoreParamV4:
case NVPTXISD::StoreV8:
// PTX doesn't support the next doubling of operands
return SDValue();
@@ -5263,30 +5069,11 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
MemVT, ST->getMemOperand());
}
-static SDValue PerformStoreCombineHelper(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- unsigned Front, unsigned Back) {
- if (all_of(N->ops().drop_front(Front).drop_back(Back),
- [](const SDUse &U) { return U.get()->isUndef(); }))
- // Operand 0 is the previous value in the chain. Cannot return EntryToken
- // as the previous value will become unused and eliminated later.
- return N->getOperand(0);
-
- return combinePackingMovIntoStore(N, DCI, Front, Back);
-}
-
static SDValue PerformStoreCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
return combinePackingMovIntoStore(N, DCI, 1, 2);
}
-static SDValue PerformStoreParamCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- // Operands from the 3rd to the 2nd last one are the values to be stored.
- // {Chain, ArgID, Offset, Val, Glue}
- return PerformStoreCombineHelper(N, DCI, 3, 1);
-}
-
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
@@ -5942,6 +5729,86 @@ static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
N->getConstantOperandAPInt(2),
N->getConstantOperandVal(3)),
SDLoc(N), N->getValueType(0));
+ return SDValue();
+}
+
+// During call lowering we wrap each return value in a ProxyReg node that
+// depends on the chain value produced by the completed call. This ensures that
+// the full call is emitted in cases where libcalls are used to legalize
+// operations. To improve the functioning of other DAG combines we pull all
+// operations we can through one of these nodes, ensuring that the ProxyReg
+// directly wraps a load. That is:
+//
+// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0)))
+//
+static SDValue sinkProxyReg(SDValue R, SDValue Chain,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ switch (R.getOpcode()) {
+ case ISD::TRUNCATE:
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::BITCAST: {
+ if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
+ return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V);
+ return SDValue();
+ }
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ case ISD::OR: {
+ if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI))
+ if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI))
+ return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B);
+ return SDValue();
+ }
+ case ISD::Constant:
+ return R;
+ case ISD::LOAD:
+ case NVPTXISD::LoadV2:
+ case NVPTXISD::LoadV4: {
+ return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(),
+ {Chain, R});
+ }
+ case ISD::BUILD_VECTOR: {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
+ SmallVector<SDValue, 16> Ops;
+ for (auto &Op : R->ops()) {
+ SDValue V = sinkProxyReg(Op, Chain, DCI);
+ if (!V)
+ return SDValue();
+ Ops.push_back(V);
+ }
+ return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops);
+ }
+ case ISD::EXTRACT_VECTOR_ELT: {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
+ if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
+ return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(R),
+ R.getValueType(), V, R.getOperand(1));
+ return SDValue();
+ }
+ default:
+ return SDValue();
+ }
+}
+
+static SDValue combineProxyReg(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+
+ SDValue Chain = N->getOperand(0);
+ SDValue Reg = N->getOperand(1);
+
+ // If the ProxyReg is not wrapping a load, try to pull the operations through
+ // the ProxyReg.
+ if (Reg.getOpcode() != ISD::LOAD) {
+ if (SDValue V = sinkProxyReg(Reg, Chain, DCI))
+ return V;
+ }
return SDValue();
}
@@ -5965,7 +5832,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FADD:
return PerformFADDCombine(N, DCI, OptLevel);
case ISD::LOAD:
- case NVPTXISD::LoadParamV2:
case NVPTXISD::LoadV2:
case NVPTXISD::LoadV4:
return combineUnpackingMovIntoLoad(N, DCI);
@@ -5973,6 +5839,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformMULCombine(N, DCI, OptLevel);
case NVPTXISD::PRMT:
return combinePRMT(N, DCI, OptLevel);
+ case NVPTXISD::ProxyReg:
+ return combineProxyReg(N, DCI);
case ISD::SETCC:
return PerformSETCCCombine(N, DCI, STI.getSmVersion());
case ISD::SHL:
@@ -5980,10 +5848,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SREM:
case ISD::UREM:
return PerformREMCombine(N, DCI, OptLevel);
- case NVPTXISD::StoreParam:
- case NVPTXISD::StoreParamV2:
- case NVPTXISD::StoreParamV4:
- return PerformStoreParamCombine(N, DCI);
case ISD::STORE:
case NVPTXISD::StoreV2:
case NVPTXISD::StoreV4:
@@ -6332,6 +6196,22 @@ static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
Results.push_back(NewValue.getValue(3));
}
+static void replaceProxyReg(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ SmallVectorImpl<SDValue> &Results) {
+ SDValue Chain = N->getOperand(0);
+ SDValue Reg = N->getOperand(1);
+
+ MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType());
+
+ SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT);
+ SDValue NewProxy =
+ DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg});
+ SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0));
+
+ Results.push_back(Res);
+}
+
void NVPTXTargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
@@ -6349,6 +6229,9 @@ void NVPTXTargetLowering::ReplaceNodeResults(
case ISD::CopyFromReg:
ReplaceCopyFromReg_128(N, DAG, Results);
return;
+ case NVPTXISD::ProxyReg:
+ replaceProxyReg(N, DAG, *this, Results);
+ return;
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 228e2aa..cf72a1e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -38,7 +38,7 @@ enum NodeType : unsigned {
/// This node represents a PTX call instruction. It's operands are as follows:
///
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
- /// NumParams, Callee, Proto, InGlue)
+ /// NumParams, Callee, Proto)
CALL,
MoveParam,
@@ -84,13 +84,7 @@ enum NodeType : unsigned {
StoreV2,
StoreV4,
StoreV8,
- LoadParam,
- LoadParamV2,
- LoadParamV4,
- StoreParam,
- StoreParamV2,
- StoreParamV4,
- LAST_MEMORY_OPCODE = StoreParamV4,
+ LAST_MEMORY_OPCODE = StoreV8,
};
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 442b900..86d6f7c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1757,12 +1757,6 @@ def SDTDeclareArrayParam :
SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
def SDTDeclareScalarParam :
SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
-def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
-def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
-def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
-def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
-def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
-def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTMoveParamProfile : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>;
def SDTProxyReg : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>;
@@ -1774,104 +1768,20 @@ def declare_array_param :
def declare_scalar_param :
SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParam,
[SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-
-def LoadParam :
- SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def LoadParamV2 :
- SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def LoadParamV4 :
- SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def StoreParam :
- SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamV2 :
- SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamV4 :
- SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def MoveParam :
SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
def proxy_reg :
SDNode<"NVPTXISD::ProxyReg", SDTProxyReg, [SDNPHasChain]>;
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
- /// NumParams, Callee, Proto, InGlue)
+ /// NumParams, Callee, Proto)
def SDTCallProfile : SDTypeProfile<0, 6,
[SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>,
SDTCisVT<3, i32>, SDTCisVT<5, i32>]>;
-def call :
- SDNode<"NVPTXISD::CALL", SDTCallProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-
-let mayLoad = true in {
- class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst), (ins Offseti32imm:$b),
- !strconcat("ld.param", opstr, " \t$dst, [retval0$b];"),
- []>;
-
- class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins Offseti32imm:$b),
- !strconcat("ld.param.v2", opstr,
- " \t{{$dst, $dst2}}, [retval0$b];"), []>;
-
- class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
- regclass:$dst4),
- (ins Offseti32imm:$b),
- !strconcat("ld.param.v4", opstr,
- " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0$b];"),
- []>;
-}
-
-let mayStore = true in {
-
- multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr, bit support_imm = true> {
- foreach op = [IMMType, regclass] in
- if !or(support_imm, !isa<NVPTXRegClass>(op)) then
- def _ # !if(!isa<NVPTXRegClass>(op), "r", "i")
- : NVPTXInst<(outs),
- (ins op:$val, i32imm:$a, Offseti32imm:$b),
- "st.param" # opstr # " \t[param$a$b], $val;",
- []>;
- }
-
- multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
- foreach op1 = [IMMType, regclass] in
- foreach op2 = [IMMType, regclass] in
- def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
- # !if(!isa<NVPTXRegClass>(op2), "r", "i")
- : NVPTXInst<(outs),
- (ins op1:$val1, op2:$val2,
- i32imm:$a, Offseti32imm:$b),
- "st.param.v2" # opstr # " \t[param$a$b], {{$val1, $val2}};",
- []>;
- }
-
- multiclass StoreParamV4Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
- foreach op1 = [IMMType, regclass] in
- foreach op2 = [IMMType, regclass] in
- foreach op3 = [IMMType, regclass] in
- foreach op4 = [IMMType, regclass] in
- def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
- # !if(!isa<NVPTXRegClass>(op2), "r", "i")
- # !if(!isa<NVPTXRegClass>(op3), "r", "i")
- # !if(!isa<NVPTXRegClass>(op4), "r", "i")
-
- : NVPTXInst<(outs),
- (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4,
- i32imm:$a, Offseti32imm:$b),
- "st.param.v4" # opstr #
- " \t[param$a$b], {{$val1, $val2, $val3, $val4}};",
- []>;
- }
-}
+def call : SDNode<"NVPTXISD::CALL", SDTCallProfile, [SDNPHasChain, SDNPSideEffect]>;
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
-/// NumParams, Callee, Proto, InGlue)
+/// NumParams, Callee, Proto)
def CallOperand : Operand<i32> { let PrintMethod = "printCallOperand"; }
@@ -1908,43 +1818,6 @@ foreach is_convergent = [0, 1] in {
(call_uni_inst $addr, imm:$rets, imm:$params)>;
}
-def LoadParamMemI64 : LoadParamMemInst<B64, ".b64">;
-def LoadParamMemI32 : LoadParamMemInst<B32, ".b32">;
-def LoadParamMemI16 : LoadParamMemInst<B16, ".b16">;
-def LoadParamMemI8 : LoadParamMemInst<B16, ".b8">;
-def LoadParamMemV2I64 : LoadParamV2MemInst<B64, ".b64">;
-def LoadParamMemV2I32 : LoadParamV2MemInst<B32, ".b32">;
-def LoadParamMemV2I16 : LoadParamV2MemInst<B16, ".b16">;
-def LoadParamMemV2I8 : LoadParamV2MemInst<B16, ".b8">;
-def LoadParamMemV4I32 : LoadParamV4MemInst<B32, ".b32">;
-def LoadParamMemV4I16 : LoadParamV4MemInst<B16, ".b16">;
-def LoadParamMemV4I8 : LoadParamV4MemInst<B16, ".b8">;
-
-defm StoreParamI64 : StoreParamInst<B64, i64imm, ".b64">;
-defm StoreParamI32 : StoreParamInst<B32, i32imm, ".b32">;
-defm StoreParamI16 : StoreParamInst<B16, i16imm, ".b16">;
-defm StoreParamI8 : StoreParamInst<B16, i8imm, ".b8">;
-
-defm StoreParamI8TruncI32 : StoreParamInst<B32, i8imm, ".b8", /* support_imm */ false>;
-defm StoreParamI8TruncI64 : StoreParamInst<B64, i8imm, ".b8", /* support_imm */ false>;
-
-defm StoreParamV2I64 : StoreParamV2Inst<B64, i64imm, ".b64">;
-defm StoreParamV2I32 : StoreParamV2Inst<B32, i32imm, ".b32">;
-defm StoreParamV2I16 : StoreParamV2Inst<B16, i16imm, ".b16">;
-defm StoreParamV2I8 : StoreParamV2Inst<B16, i8imm, ".b8">;
-
-defm StoreParamV4I32 : StoreParamV4Inst<B32, i32imm, ".b32">;
-defm StoreParamV4I16 : StoreParamV4Inst<B16, i16imm, ".b16">;
-defm StoreParamV4I8 : StoreParamV4Inst<B16, i8imm, ".b8">;
-
-defm StoreParamF32 : StoreParamInst<B32, f32imm, ".b32">;
-defm StoreParamF64 : StoreParamInst<B64, f64imm, ".b64">;
-
-defm StoreParamV2F32 : StoreParamV2Inst<B32, f32imm, ".b32">;
-defm StoreParamV2F64 : StoreParamV2Inst<B64, f64imm, ".b64">;
-
-defm StoreParamV4F32 : StoreParamV4Inst<B32, f32imm, ".b32">;
-
def DECLARE_PARAM_array :
NVPTXInst<(outs), (ins i32imm:$a, i32imm:$align, i32imm:$size),
".param .align $align .b8 \t$a[$size];", []>;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 5779d4e..0e8828f 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -243,8 +243,6 @@ public:
createObjectTargetWriter() const override {
return createPPCXCOFFObjectWriter(TT.isArch64Bit());
}
-
- std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
};
} // end anonymous namespace
@@ -279,13 +277,6 @@ ELFPPCAsmBackend::getFixupKind(StringRef Name) const {
return std::nullopt;
}
-std::optional<MCFixupKind>
-XCOFFPPCAsmBackend::getFixupKind(StringRef Name) const {
- return StringSwitch<std::optional<MCFixupKind>>(Name)
- .Case("R_REF", PPC::fixup_ppc_nofixup)
- .Default(std::nullopt);
-}
-
MCAsmBackend *llvm::createPPCAsmBackend(const Target &T,
const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
index 9e8ee9f..df0c666 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -48,8 +48,7 @@ enum Fixups {
/// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the
/// TLS general and local dynamic models, or inserts the thread-pointer
- /// register number. It can also be used to tie the ref symbol to prevent it
- /// from being garbage collected on AIX.
+ /// register number.
fixup_ppc_nofixup,
/// A 16-bit fixup corresponding to lo16(_foo) with implied 3 zero bits for
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
index f75ab62..a04f404 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
@@ -56,6 +56,8 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
switch ((unsigned)Fixup.getKind()) {
default:
report_fatal_error("Unimplemented fixup kind.");
+ case XCOFF::RelocationType::R_REF:
+ return {XCOFF::RelocationType::R_REF, 0};
case PPC::fixup_ppc_half16: {
const uint8_t SignAndSizeForHalf16 = EncodedSignednessIndicator | 15;
switch (Specifier) {
@@ -96,12 +98,6 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
return {XCOFF::RelocationType::R_RBR, EncodedSignednessIndicator | 25};
case PPC::fixup_ppc_br24abs:
return {XCOFF::RelocationType::R_RBA, EncodedSignednessIndicator | 25};
- case PPC::fixup_ppc_nofixup: {
- if (Specifier == PPC::S_None)
- return {XCOFF::RelocationType::R_REF, 0};
- else
- llvm_unreachable("Unsupported Modifier");
- } break;
case FK_Data_4:
case FK_Data_8:
const uint8_t SignAndSizeForFKData =
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 5e54b82..67cc01e 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -534,16 +534,26 @@ static DecodeStatus decodeRTZArg(MCInst &Inst, uint32_t Imm, int64_t Address,
return MCDisassembler::Success;
}
-static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE);
+ if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createImm(Imm));
+ return MCDisassembler::Success;
+}
static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm,
uint64_t Address,
+ const MCDisassembler *Decoder) {
+ if (Imm < RISCVZC::RA_S0)
+ return MCDisassembler::Fail;
+ return decodeZcmpRlist(Inst, Imm, Address, Decoder);
+}
+
+static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
+ uint64_t Address,
const MCDisassembler *Decoder);
static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
@@ -592,24 +602,6 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
return S;
}
-static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE);
- if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2))
- return MCDisassembler::Fail;
- Inst.addOperand(MCOperand::createImm(Imm));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (Imm < RISCVZC::RA_S0)
- return MCDisassembler::Fail;
- return decodeZcmpRlist(Inst, Imm, Address, Decoder);
-}
-
// Add implied SP operand for C.*SP compressed instructions. The SP operand
// isn't explicitly encoded in the instruction.
void RISCVDisassembler::addSPOperands(MCInst &MI) const {
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index b1ab76a..9fc0d81 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -1581,7 +1581,8 @@ void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
// Set the register and all its subregisters.
if (!MRI.def_empty(CSReg) || MRI.getUsedPhysRegsMask().test(CSReg)) {
SavedRegs.set(CSReg);
- llvm::for_each(SubRegs, [&](unsigned Reg) { return SavedRegs.set(Reg); });
+ for (unsigned Reg : SubRegs)
+ SavedRegs.set(Reg);
}
// Combine to super register if all of its subregisters are marked.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 54845e5..43e4f8e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2739,6 +2739,27 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const {
}
}
+bool RISCVTargetLowering::isLegalLoadStoreElementTypeForRVV(
+ EVT ScalarTy) const {
+ if (!ScalarTy.isSimple())
+ return false;
+ switch (ScalarTy.getSimpleVT().SimpleTy) {
+ case MVT::iPTR:
+ return Subtarget.is64Bit() ? Subtarget.hasVInstructionsI64() : true;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::f16:
+ case MVT::bf16:
+ case MVT::f32:
+ return true;
+ case MVT::i64:
+ case MVT::f64:
+ return Subtarget.hasVInstructionsI64();
+ default:
+ return false;
+ }
+}
unsigned RISCVTargetLowering::combineRepeatedFPDivisors() const {
return NumRepeatedDivisors;
@@ -20822,6 +20843,62 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
}
break;
}
+ case RISCVISD::TUPLE_EXTRACT: {
+ EVT VT = N->getValueType(0);
+ SDValue Tuple = N->getOperand(0);
+ unsigned Idx = N->getConstantOperandVal(1);
+ if (!Tuple.hasOneUse() || Tuple.getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ break;
+
+ unsigned NF = 0;
+ switch (Tuple.getConstantOperandVal(1)) {
+ default:
+ break;
+ case Intrinsic::riscv_vlseg2_mask:
+ case Intrinsic::riscv_vlseg3_mask:
+ case Intrinsic::riscv_vlseg4_mask:
+ case Intrinsic::riscv_vlseg5_mask:
+ case Intrinsic::riscv_vlseg6_mask:
+ case Intrinsic::riscv_vlseg7_mask:
+ case Intrinsic::riscv_vlseg8_mask:
+ NF = Tuple.getValueType().getRISCVVectorTupleNumFields();
+ break;
+ }
+
+ if (!NF || Subtarget.hasOptimizedSegmentLoadStore(NF))
+ break;
+
+ unsigned SEW = VT.getScalarSizeInBits();
+ assert(Log2_64(SEW) == Tuple.getConstantOperandVal(7) &&
+ "Type mismatch without bitcast?");
+ unsigned Stride = SEW / 8 * NF;
+ unsigned Offset = SEW / 8 * Idx;
+
+ SDValue Ops[] = {
+ /*Chain=*/Tuple.getOperand(0),
+ /*IntID=*/DAG.getTargetConstant(Intrinsic::riscv_vlse_mask, DL, XLenVT),
+ /*Passthru=*/Tuple.getOperand(2),
+ /*Ptr=*/
+ DAG.getNode(ISD::ADD, DL, XLenVT, Tuple.getOperand(3),
+ DAG.getConstant(Offset, DL, XLenVT)),
+ /*Stride=*/DAG.getConstant(Stride, DL, XLenVT),
+ /*Mask=*/Tuple.getOperand(4),
+ /*VL=*/Tuple.getOperand(5),
+ /*Policy=*/Tuple.getOperand(6)};
+
+ auto TupleMemSD = cast<MemIntrinsicSDNode>(Tuple);
+ // Match getTgtMemIntrinsic for non-unit stride case
+ EVT MemVT = TupleMemSD->getMemoryVT().getScalarType();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ TupleMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize);
+
+ SDVTList VTs = DAG.getVTList({VT, MVT::Other});
+ SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs,
+ Ops, MemVT, MMO);
+ DAG.ReplaceAllUsesOfValueWith(Tuple.getValue(1), Result.getValue(1));
+ return Result.getValue(0);
+ }
}
return SDValue();
@@ -24239,7 +24316,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType,
return false;
EVT ScalarType = DataType.getScalarType();
- if (!isLegalElementTypeForRVV(ScalarType))
+ if (!isLegalLoadStoreElementTypeForRVV(ScalarType))
return false;
if (!Subtarget.enableUnalignedVectorMem() &&
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index ca70c46..a788c0b7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -384,6 +384,7 @@ public:
bool shouldRemoveExtendFromGSIndex(SDValue Extend, EVT DataVT) const override;
bool isLegalElementTypeForRVV(EVT ScalarTy) const;
+ bool isLegalLoadStoreElementTypeForRVV(EVT ScalarTy) const;
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index dd365cf..8297d50 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -136,6 +136,7 @@ class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr>
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtP] in {
+let IsSignExtendingOpW = 1 in
def CLS : Unary_r<0b011000000011, 0b001, "cls">;
def ABS : Unary_r<0b011000000111, 0b001, "abs">;
} // Predicates = [HasStdExtP]
@@ -146,8 +147,10 @@ let Predicates = [HasStdExtP, IsRV64] in {
def REV16 : Unary_r<0b011010110000, 0b101, "rev16">;
def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">;
+let IsSignExtendingOpW = 1 in {
def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">;
def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">;
+}
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in {
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 30d8f85..17e2f01 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -32,7 +32,7 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType(
if (!isTypeLegal(VT))
return false;
- if (!isLegalElementTypeForRVV(VT.getScalarType()) ||
+ if (!isLegalLoadStoreElementTypeForRVV(VT.getScalarType()) ||
!allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace,
Alignment))
return false;
@@ -216,29 +216,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
return false;
- // If the segment load is going to be performed segment at a time anyways
- // and there's only one element used, use a strided load instead. This
- // will be equally fast, and create less vector register pressure.
- if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) {
- unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
- Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
- Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
- Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
- // For rv64, need to truncate i64 to i32 to match signature. As VL is at most
- // the number of active lanes (which is bounded by i32) this is safe.
- VL = Builder.CreateTrunc(VL, Builder.getInt32Ty());
-
- CallInst *CI =
- Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
- {VTy, BasePtr->getType(), Stride->getType()},
- {BasePtr, Stride, Mask, VL});
- Alignment = commonAlignment(Alignment, Indices[0] * ScalarSizeInBytes);
- CI->addParamAttr(0,
- Attribute::getWithAlignment(CI->getContext(), Alignment));
- Shuffles[0]->replaceAllUsesWith(CI);
- return true;
- };
-
CallInst *VlsegN = Builder.CreateIntrinsic(
FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index d62d99c..f0510ec 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -265,7 +265,7 @@ public:
if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
return false;
- return TLI->isLegalElementTypeForRVV(ElemType);
+ return TLI->isLegalLoadStoreElementTypeForRVV(ElemType);
}
bool isLegalMaskedLoad(Type *DataType, Align Alignment,
@@ -297,7 +297,7 @@ public:
if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
return false;
- return TLI->isLegalElementTypeForRVV(ElemType);
+ return TLI->isLegalLoadStoreElementTypeForRVV(ElemType);
}
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index cd434f7..3f80b2a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3436,8 +3436,7 @@ static SDValue performSETCCCombine(SDNode *N,
return SDValue();
}
-static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
- assert(N->getOpcode() == ISD::MUL);
+static SDValue TryWideExtMulCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
if (VT != MVT::v8i32 && VT != MVT::v16i32)
return SDValue();
@@ -3523,6 +3522,46 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue performMulCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(N->getOpcode() == ISD::MUL);
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector())
+ return SDValue();
+
+ if (auto Res = TryWideExtMulCombine(N, DCI.DAG))
+ return Res;
+
+ // We don't natively support v16i8 mul, but we do support v8i16 so split the
+ // inputs and extend them to v8i16. Only do this before legalization in case
+ // a narrow vector is widened and may be simplified later.
+ if (!DCI.isBeforeLegalize() || VT != MVT::v16i8)
+ return SDValue();
+
+ SDLoc DL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue LowLHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS);
+ SDValue HighLHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS);
+ SDValue LowRHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS);
+ SDValue HighRHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS);
+
+ SDValue MulLow =
+ DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS));
+ SDValue MulHigh = DAG.getBitcast(
+ VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS));
+
+ // Take the low byte of each lane.
+ return DAG.getVectorShuffle(
+ VT, DL, MulLow, MulHigh,
+ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+}
+
SDValue
WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -3557,6 +3596,6 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
return performLowerPartialReduction(N, DCI.DAG);
}
case ISD::MUL:
- return performMulCombine(N, DCI.DAG);
+ return performMulCombine(N, DCI);
}
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index d13862f..143298b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1540,6 +1540,8 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate>
def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
(!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+ def : Pat<(fsub_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
+ (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
}
defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 28f6599..c3990d1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -782,6 +782,24 @@ void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) {
for (Instruction &I : BB) {
if (I.getType()->isVoidTy())
continue;
+
+ if (isa<AllocaInst>(&I)) {
+ // If the alloca has any lifetime marker that is no longer dominated
+ // by the alloca, remove all lifetime markers. Lifetime markers must
+ // always work directly on the alloca, and this is no longer possible.
+ bool HasNonDominatedLifetimeMarker = any_of(I.users(), [&](User *U) {
+ auto *UserI = cast<Instruction>(U);
+ return UserI->isLifetimeStartOrEnd() && !DT.dominates(&I, UserI);
+ });
+ if (HasNonDominatedLifetimeMarker) {
+ for (User *U : make_early_inc_range(I.users())) {
+ auto *UserI = cast<Instruction>(U);
+ if (UserI->isLifetimeStartOrEnd())
+ UserI->eraseFromParent();
+ }
+ }
+ }
+
unsigned VarID = SSA.AddVariable(I.getName(), I.getType());
// If a value is defined by an invoke instruction, it is only available in
// its normal destination and not in its unwind destination.
@@ -1269,10 +1287,20 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
// Setjmp preparation
+ SmallVector<AllocaInst *> StaticAllocas;
+ for (Instruction &I : F.getEntryBlock())
+ if (auto *AI = dyn_cast<AllocaInst>(&I))
+ if (AI->isStaticAlloca())
+ StaticAllocas.push_back(AI);
+
BasicBlock *Entry = &F.getEntryBlock();
DebugLoc FirstDL = getOrCreateDebugLoc(&*Entry->begin(), F.getSubprogram());
SplitBlock(Entry, &*Entry->getFirstInsertionPt());
+ // Move static allocas back into the entry block, so they stay static.
+ for (AllocaInst *AI : StaticAllocas)
+ AI->moveBefore(Entry->getTerminator()->getIterator());
+
IRB.SetInsertPoint(Entry->getTerminator()->getIterator());
// This alloca'ed pointer is used by the runtime to identify function
// invocations. It's just for pointer comparisons. It will never be
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index 17c9833..d6afb8a 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -858,16 +858,15 @@ void RISCVISAInfo::updateImplication() {
StringRef ExtName = WorkList.pop_back_val();
auto Range = std::equal_range(std::begin(ImpliedExts),
std::end(ImpliedExts), ExtName);
- std::for_each(Range.first, Range.second,
- [&](const ImpliedExtsEntry &Implied) {
- const char *ImpliedExt = Implied.ImpliedExt;
- auto [It, Inserted] = Exts.try_emplace(ImpliedExt);
- if (!Inserted)
- return;
- auto Version = findDefaultVersion(ImpliedExt);
- It->second = *Version;
- WorkList.push_back(ImpliedExt);
- });
+ for (const ImpliedExtsEntry &Implied : llvm::make_range(Range)) {
+ const char *ImpliedExt = Implied.ImpliedExt;
+ auto [It, Inserted] = Exts.try_emplace(ImpliedExt);
+ if (!Inserted)
+ continue;
+ auto Version = findDefaultVersion(ImpliedExt);
+ It->second = *Version;
+ WorkList.push_back(ImpliedExt);
+ }
}
// Add Zcd if C and D are enabled.
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index ee6651c..6acb0bc 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -277,6 +277,8 @@ StringRef Triple::getVendorTypeName(VendorType Kind) {
case PC: return "pc";
case SCEI: return "scei";
case SUSE: return "suse";
+ case Meta:
+ return "meta";
}
llvm_unreachable("Invalid VendorType!");
@@ -390,6 +392,8 @@ StringRef Triple::getEnvironmentTypeName(EnvironmentType Kind) {
case OpenHOS: return "ohos";
case PAuthTest:
return "pauthtest";
+ case MTIA:
+ return "mtia";
case LLVM:
return "llvm";
case Mlibc:
@@ -677,6 +681,7 @@ static Triple::VendorType parseVendor(StringRef VendorName) {
.Case("suse", Triple::SUSE)
.Case("oe", Triple::OpenEmbedded)
.Case("intel", Triple::Intel)
+ .Case("meta", Triple::Meta)
.Default(Triple::UnknownVendor);
}
@@ -780,6 +785,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) {
.StartsWith("pauthtest", Triple::PAuthTest)
.StartsWith("llvm", Triple::LLVM)
.StartsWith("mlibc", Triple::Mlibc)
+ .StartsWith("mtia", Triple::MTIA)
.Default(Triple::UnknownEnvironment);
}
diff --git a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
index b3910c4..d895cd7 100644
--- a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
+++ b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
@@ -37,6 +37,16 @@
// memory that ends up in one of the runtime equivalents, since this can
// happen if e.g. a library that was compiled without interposition returns
// an allocation that can be validly passed to `free`.
+//
+// 3. MathFixup (required): Some accelerators might have an incomplete
+// implementation for the intrinsics used to implement some of the math
+// functions in <cmath> / their corresponding libcall lowerings. Since this
+// can vary quite significantly between accelerators, we replace calls to a
+// set of intrinsics / lib functions known to be problematic with calls to a
+// HIPSTDPAR specific forwarding layer, which gives a uniform interface for
+// accelerators to implement in their own runtime components. This pass
+// should run before AcceleratorCodeSelection so as to prevent the spurious
+// removal of the HIPSTDPAR specific forwarding functions.
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/HipStdPar/HipStdPar.h"
@@ -49,6 +59,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -519,3 +530,110 @@ HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) {
return PreservedAnalyses::none();
}
+
+// Lookup table pairing a math library function name (first) with the name of
+// the HIPSTDPAR forwarding function to call in its place (second). Unsuffixed
+// names map to `_f64` forwarders and `f`-suffixed names map to `_f32`
+// forwarders. HipStdParMathFixupPass::run searches it linearly by the first
+// member for functions that are not intrinsics.
+static constexpr std::pair<StringLiteral, StringLiteral> MathLibToHipStdPar[]{
+ {"acosh", "__hipstdpar_acosh_f64"},
+ {"acoshf", "__hipstdpar_acosh_f32"},
+ {"asinh", "__hipstdpar_asinh_f64"},
+ {"asinhf", "__hipstdpar_asinh_f32"},
+ {"atanh", "__hipstdpar_atanh_f64"},
+ {"atanhf", "__hipstdpar_atanh_f32"},
+ {"cbrt", "__hipstdpar_cbrt_f64"},
+ {"cbrtf", "__hipstdpar_cbrt_f32"},
+ {"erf", "__hipstdpar_erf_f64"},
+ {"erff", "__hipstdpar_erf_f32"},
+ {"erfc", "__hipstdpar_erfc_f64"},
+ {"erfcf", "__hipstdpar_erfc_f32"},
+ {"fdim", "__hipstdpar_fdim_f64"},
+ {"fdimf", "__hipstdpar_fdim_f32"},
+ {"expm1", "__hipstdpar_expm1_f64"},
+ {"expm1f", "__hipstdpar_expm1_f32"},
+ {"hypot", "__hipstdpar_hypot_f64"},
+ {"hypotf", "__hipstdpar_hypot_f32"},
+ {"ilogb", "__hipstdpar_ilogb_f64"},
+ {"ilogbf", "__hipstdpar_ilogb_f32"},
+ {"lgamma", "__hipstdpar_lgamma_f64"},
+ {"lgammaf", "__hipstdpar_lgamma_f32"},
+ {"log1p", "__hipstdpar_log1p_f64"},
+ {"log1pf", "__hipstdpar_log1p_f32"},
+ {"logb", "__hipstdpar_logb_f64"},
+ {"logbf", "__hipstdpar_logb_f32"},
+ {"nextafter", "__hipstdpar_nextafter_f64"},
+ {"nextafterf", "__hipstdpar_nextafter_f32"},
+ {"nexttoward", "__hipstdpar_nexttoward_f64"},
+ {"nexttowardf", "__hipstdpar_nexttoward_f32"},
+ {"remainder", "__hipstdpar_remainder_f64"},
+ {"remainderf", "__hipstdpar_remainder_f32"},
+ {"remquo", "__hipstdpar_remquo_f64"},
+ {"remquof", "__hipstdpar_remquo_f32"},
+ {"scalbln", "__hipstdpar_scalbln_f64"},
+ {"scalblnf", "__hipstdpar_scalbln_f32"},
+ {"scalbn", "__hipstdpar_scalbn_f64"},
+ {"scalbnf", "__hipstdpar_scalbn_f32"},
+ {"tgamma", "__hipstdpar_tgamma_f64"},
+ {"tgammaf", "__hipstdpar_tgamma_f32"}};
+
+/// Redirects every use of a selected set of math functions to a HIPSTDPAR
+/// forwarding function.
+///
+/// Two kinds of named functions in \p M are rewritten:
+///  - non-intrinsic functions whose name appears in MathLibToHipStdPar are
+///    redirected to the forwarder named by the table;
+///  - the intrinsics listed in the switch below (some unconditionally, some
+///    only when they return double) are redirected to a forwarder whose name
+///    is the intrinsic name with '.' replaced by '_' and the leading "llvm"
+///    replaced by "__hipstdpar".
+/// The forwarder is obtained with Module::getOrInsertFunction using the
+/// original function's type, and all uses of the original are replaced.
+///
+/// \returns PreservedAnalyses::all() for an empty module, otherwise
+/// PreservedAnalyses::none().
+PreservedAnalyses HipStdParMathFixupPass::run(Module &M,
+                                              ModuleAnalysisManager &) {
+  if (M.empty())
+    return PreservedAnalyses::all();
+
+  SmallVector<std::pair<Function *, std::string>> ToReplace;
+  for (auto &&F : M) {
+    if (!F.hasName())
+      continue;
+
+    StringRef N = F.getName();
+    Intrinsic::ID ID = F.getIntrinsicID();
+
+    switch (ID) {
+    case Intrinsic::not_intrinsic: {
+      auto It = find_if(MathLibToHipStdPar,
+                        [&](auto &&Entry) { return Entry.first == N; });
+      if (It == std::cend(MathLibToHipStdPar))
+        continue;
+      ToReplace.emplace_back(&F, It->second);
+      // The table already supplies the final forwarder name, so move on to the
+      // next function. Leaving the switch with `break` would run the intrinsic
+      // renaming below and queue F a second time under a mangled name
+      // (e.g. "acosh" -> "__hipstdparh"), inserting a stray declaration.
+      continue;
+    }
+    // Intrinsics that are always forwarded.
+    case Intrinsic::acos:
+    case Intrinsic::asin:
+    case Intrinsic::atan:
+    case Intrinsic::atan2:
+    case Intrinsic::cosh:
+    case Intrinsic::modf:
+    case Intrinsic::sinh:
+    case Intrinsic::tan:
+    case Intrinsic::tanh:
+      break;
+    default: {
+      // Intrinsics that are forwarded only when they return double.
+      if (F.getReturnType()->isDoubleTy()) {
+        switch (ID) {
+        case Intrinsic::cos:
+        case Intrinsic::exp:
+        case Intrinsic::exp2:
+        case Intrinsic::log:
+        case Intrinsic::log10:
+        case Intrinsic::log2:
+        case Intrinsic::pow:
+        case Intrinsic::sin:
+          break;
+        default:
+          continue;
+        }
+        break;
+      }
+      continue;
+    }
+    }
+
+    // Only intrinsics reach this point: derive the forwarder name from the
+    // intrinsic name, e.g. "llvm.sin.f64" -> "__hipstdpar_sin_f64".
+    ToReplace.emplace_back(&F, N);
+    llvm::replace(ToReplace.back().second, '.', '_');
+    StringRef Prefix = "llvm";
+    ToReplace.back().second.replace(0, Prefix.size(), "__hipstdpar");
+  }
+  // Rewrite uses only after the scan, so that declarations inserted here are
+  // not visited by the loop over M above.
+  for (auto &&[F, NewF] : ToReplace)
+    F->replaceAllUsesWith(
+        M.getOrInsertFunction(NewF, F->getFunctionType()).getCallee());
+
+  return PreservedAnalyses::none();
+}
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index f43202e..8262c8c 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -1863,7 +1863,6 @@ void AttributeInferer::run(const SCCNodeSet &SCCNodes,
struct SCCNodesResult {
SCCNodeSet SCCNodes;
- bool HasUnknownCall;
};
} // end anonymous namespace
@@ -2227,29 +2226,13 @@ static void addWillReturn(const SCCNodeSet &SCCNodes,
static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) {
SCCNodesResult Res;
- Res.HasUnknownCall = false;
for (Function *F : Functions) {
if (!F || F->hasOptNone() || F->hasFnAttribute(Attribute::Naked) ||
F->isPresplitCoroutine()) {
- // Treat any function we're trying not to optimize as if it were an
- // indirect call and omit it from the node set used below.
- Res.HasUnknownCall = true;
+ // Omit any functions we're trying not to optimize from the set.
continue;
}
- // Track whether any functions in this SCC have an unknown call edge.
- // Note: if this is ever a performance hit, we can common it with
- // subsequent routines which also do scans over the instructions of the
- // function.
- if (!Res.HasUnknownCall) {
- for (Instruction &I : instructions(*F)) {
- if (auto *CB = dyn_cast<CallBase>(&I)) {
- if (!CB->getCalledFunction()) {
- Res.HasUnknownCall = true;
- break;
- }
- }
- }
- }
+
Res.SCCNodes.insert(F);
}
return Res;
@@ -2282,15 +2265,10 @@ deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter,
addColdAttrs(Nodes.SCCNodes, Changed);
addWillReturn(Nodes.SCCNodes, Changed);
addNoUndefAttrs(Nodes.SCCNodes, Changed);
-
- // If we have no external nodes participating in the SCC, we can deduce some
- // more precise attributes as well.
- if (!Nodes.HasUnknownCall) {
- addNoAliasAttrs(Nodes.SCCNodes, Changed);
- addNonNullAttrs(Nodes.SCCNodes, Changed);
- inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed);
- addNoRecurseAttrs(Nodes.SCCNodes, Changed);
- }
+ addNoAliasAttrs(Nodes.SCCNodes, Changed);
+ addNonNullAttrs(Nodes.SCCNodes, Changed);
+ inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed);
+ addNoRecurseAttrs(Nodes.SCCNodes, Changed);
// Finally, infer the maximal set of attributes from the ones we've inferred
// above. This is handling the cases where one attribute on a signature
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 0164fcd..c009c1e 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -97,6 +97,8 @@ STATISTIC(MissingAllocForContextId,
"Number of missing alloc nodes for context ids");
STATISTIC(SkippedCallsCloning,
"Number of calls skipped during cloning due to unexpected operand");
+STATISTIC(MismatchedCloneAssignments,
+ "Number of callsites assigned to call multiple non-matching clones");
static cl::opt<std::string> DotFilePathPrefix(
"memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
@@ -730,7 +732,7 @@ private:
/// of the functions tracked calls to their new versions in the CallMap.
/// Assigns new clones to clone number CloneNo.
FuncInfo cloneFunctionForCallsite(
- FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+ FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
@@ -897,7 +899,7 @@ private:
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *>::FuncInfo
cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
- std::map<CallInfo, CallInfo> &CallMap,
+ DenseMap<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc,
unsigned CloneNo);
std::string getLabel(const Function *Func, const Instruction *Call,
@@ -989,7 +991,7 @@ private:
CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
IndexCall>::FuncInfo
cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
- std::map<CallInfo, CallInfo> &CallMap,
+ DenseMap<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc,
unsigned CloneNo);
std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
@@ -2060,6 +2062,20 @@ static bool isMemProfClone(const Function &F) {
return F.getName().contains(MemProfCloneSuffix);
}
+// Return the clone number of the given function by extracting it from the
+// memprof suffix, i.e. by parsing the decimal digits that follow the last '.'
+// in the function name. Assumes the caller has already confirmed it is a
+// memprof clone.
+static unsigned getMemProfCloneNum(const Function &F) {
+  assert(isMemProfClone(F));
+  StringRef Name = F.getName();
+  auto Pos = Name.find_last_of('.');
+  // find_last_of reports "not found" as StringRef::npos, which is the largest
+  // size_t value, so a bare "Pos > 0" check would let a missing '.' through.
+  assert(Pos != StringRef::npos && Pos > 0 &&
+         "Expected a '.' separating the clone number from the function name");
+  // Zero-initialize so that builds without assertions return a deterministic
+  // value instead of reading an uninitialized variable if parsing fails.
+  unsigned CloneNo = 0;
+  bool Err = Name.drop_front(Pos + 1).getAsInteger(10, CloneNo);
+  assert(!Err && "Expected a decimal clone number after the last '.'");
+  (void)Err;
+  return CloneNo;
+}
+
std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
const Instruction *Call,
unsigned CloneNo) const {
@@ -3979,7 +3995,22 @@ IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
FuncInfo CalleeFunc) {
- if (CalleeFunc.cloneNo() > 0)
+ auto *CurF = cast<CallBase>(CallerCall.call())->getCalledFunction();
+ auto NewCalleeCloneNo = CalleeFunc.cloneNo();
+ if (isMemProfClone(*CurF)) {
+ // If we already assigned this callsite to call a specific non-default
+ // clone (i.e. not the original function which is clone 0), ensure that we
+ // aren't trying to now update it to call a different clone, which is
+ // indicative of a bug in the graph or function assignment.
+ auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
+ if (CurCalleeCloneNo != NewCalleeCloneNo) {
+ LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
+ << CurCalleeCloneNo << " now " << NewCalleeCloneNo
+ << "\n");
+ MismatchedCloneAssignments++;
+ }
+ }
+ if (NewCalleeCloneNo > 0)
cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
OREGetter(CallerCall.call()->getFunction())
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
@@ -3995,7 +4026,19 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
assert(CI &&
"Caller cannot be an allocation which should not have profiled calls");
assert(CI->Clones.size() > CallerCall.cloneNo());
- CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
+ auto NewCalleeCloneNo = CalleeFunc.cloneNo();
+ auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
+ // If we already assigned this callsite to call a specific non-default
+ // clone (i.e. not the original function which is clone 0), ensure that we
+ // aren't trying to now update it to call a different clone, which is
+ // indicative of a bug in the graph or function assignment.
+ if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
+ LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
+ << CurCalleeCloneNo << " now " << NewCalleeCloneNo
+ << "\n");
+ MismatchedCloneAssignments++;
+ }
+ CurCalleeCloneNo = NewCalleeCloneNo;
}
// Update the debug information attached to NewFunc to use the clone Name. Note
@@ -4019,7 +4062,7 @@ static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *>::FuncInfo
ModuleCallsiteContextGraph::cloneFunctionForCallsite(
- FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+ FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
// Use existing LLVM facilities for cloning and obtaining Call in clone
ValueToValueMapTy VMap;
@@ -4042,7 +4085,7 @@ ModuleCallsiteContextGraph::cloneFunctionForCallsite(
CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
IndexCall>::FuncInfo
IndexCallsiteContextGraph::cloneFunctionForCallsite(
- FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+ FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
// Check how many clones we have of Call (and therefore function).
// The next clone number is the current size of versions array.
@@ -4457,14 +4500,24 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
};
+ // Information for a single clone of this Func.
+ struct FuncCloneInfo {
+ // The function clone.
+ FuncInfo FuncClone;
+ // Remappings of each call of interest (from original uncloned call to the
+ // corresponding cloned call in this function clone).
+ DenseMap<CallInfo, CallInfo> CallMap;
+ };
+
// Walk all functions for which we saw calls with memprof metadata, and handle
// cloning for each of its calls.
for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
FuncInfo OrigFunc(Func);
- // Map from each clone of OrigFunc to a map of remappings of each call of
- // interest (from original uncloned call to the corresponding cloned call in
- // that function clone).
- std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
+ // Map from each clone number of OrigFunc to information about that function
+ // clone (the function clone FuncInfo and call remappings). The index into
+ // the vector is the clone number, as function clones are created and
+ // numbered sequentially.
+ std::vector<FuncCloneInfo> FuncCloneInfos;
for (auto &Call : CallsWithMetadata) {
ContextNode *Node = getNodeForInst(Call);
// Skip call if we do not have a node for it (all uses of its stack ids
@@ -4488,8 +4541,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// Record the clone of callsite node assigned to this function clone.
FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
- assert(FuncClonesToCallMap.count(FuncClone));
- std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
+ assert(FuncCloneInfos.size() > FuncClone.cloneNo());
+ DenseMap<CallInfo, CallInfo> &CallMap =
+ FuncCloneInfos[FuncClone.cloneNo()].CallMap;
CallInfo CallClone(Call);
if (auto It = CallMap.find(Call); It != CallMap.end())
CallClone = It->second;
@@ -4528,10 +4582,10 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// than existing function clones, which would have been assigned to an
// earlier clone in the list (we assign callsite clones to function
// clones greedily).
- if (FuncClonesToCallMap.size() < NodeCloneCount) {
+ if (FuncCloneInfos.size() < NodeCloneCount) {
// If this is the first callsite copy, assign to original function.
if (NodeCloneCount == 1) {
- // Since FuncClonesToCallMap is empty in this case, no clones have
+ // Since FuncCloneInfos is empty in this case, no clones have
// been created for this function yet, and no callers should have
// been assigned a function clone for this callee node yet.
assert(llvm::none_of(
@@ -4540,7 +4594,8 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
}));
// Initialize with empty call map, assign Clone to original function
// and its callers, and skip to the next clone.
- FuncClonesToCallMap[OrigFunc] = {};
+ FuncCloneInfos.push_back(
+ {OrigFunc, DenseMap<CallInfo, CallInfo>()});
AssignCallsiteCloneToFuncClone(
OrigFunc, Call, Clone,
AllocationCallToContextNodeMap.count(Call));
@@ -4572,14 +4627,14 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
}
// Clone function and save it along with the CallInfo map created
- // during cloning in the FuncClonesToCallMap.
- std::map<CallInfo, CallInfo> NewCallMap;
- unsigned CloneNo = FuncClonesToCallMap.size();
+ // during cloning in the FuncCloneInfos.
+ DenseMap<CallInfo, CallInfo> NewCallMap;
+ unsigned CloneNo = FuncCloneInfos.size();
assert(CloneNo > 0 && "Clone 0 is the original function, which "
"should already exist in the map");
FuncInfo NewFuncClone = cloneFunctionForCallsite(
OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
- FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
+ FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
FunctionClonesAnalysis++;
Changed = true;
@@ -4680,8 +4735,8 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// CallMap is set up as indexed by original Call at clone 0.
CallInfo OrigCall(Callee->getOrigNode()->Call);
OrigCall.setCloneNo(0);
- std::map<CallInfo, CallInfo> &CallMap =
- FuncClonesToCallMap[NewFuncClone];
+ DenseMap<CallInfo, CallInfo> &CallMap =
+ FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
assert(CallMap.count(OrigCall));
CallInfo NewCall(CallMap[OrigCall]);
assert(NewCall);
@@ -4703,6 +4758,19 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// where the callers were assigned to different clones of a function.
}
+ auto FindFirstAvailFuncClone = [&]() {
+ // Find first function in FuncCloneInfos without an assigned
+ // clone of this callsite Node. We should always have one
+ // available at this point due to the earlier cloning when the
+ // FuncCloneInfos size was smaller than the clone number.
+ for (auto &CF : FuncCloneInfos) {
+ if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
+ return CF.FuncClone;
+ }
+ llvm_unreachable(
+ "Expected an available func clone for this callsite clone");
+ };
+
// See if we can use existing function clone. Walk through
// all caller edges to see if any have already been assigned to
// a clone of this callsite's function. If we can use it, do so. If not,
@@ -4819,16 +4887,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// clone of OrigFunc for another caller during this iteration over
// its caller edges.
if (!FuncCloneAssignedToCurCallsiteClone) {
- // Find first function in FuncClonesToCallMap without an assigned
- // clone of this callsite Node. We should always have one
- // available at this point due to the earlier cloning when the
- // FuncClonesToCallMap size was smaller than the clone number.
- for (auto &CF : FuncClonesToCallMap) {
- if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
- FuncCloneAssignedToCurCallsiteClone = CF.first;
- break;
- }
- }
+ FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
assert(FuncCloneAssignedToCurCallsiteClone);
// Assign Clone to FuncCloneAssignedToCurCallsiteClone
AssignCallsiteCloneToFuncClone(
@@ -4842,6 +4901,31 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
FuncCloneAssignedToCurCallsiteClone);
}
}
+ // If we didn't assign a function clone to this callsite clone yet, e.g.
+ // none of its callers has a non-null call, do the assignment here.
+ // We want to ensure that every callsite clone is assigned to some
+ // function clone, so that the call updates below work as expected.
+ // In particular if this is the original callsite, we want to ensure it
+ // is assigned to the original function, otherwise the original function
+ // will appear available for assignment to other callsite clones,
+ // leading to unintended effects. For one, the unknown and not updated
+ // callers will call into cloned paths leading to the wrong hints,
+ // because they still call the original function (clone 0). Also,
+ // because all callsites start out as being clone 0 by default, we can't
+ // easily distinguish between callsites explicitly assigned to clone 0
+ // vs those never assigned, which can lead to multiple updates of the
+ // calls when invoking updateCall below, with mismatched clone values.
+ // TODO: Add a flag to the callsite nodes or some other mechanism to
+ // better distinguish and identify callsite clones that are not getting
+ // assigned to function clones as expected.
+ if (!FuncCloneAssignedToCurCallsiteClone) {
+ FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
+ assert(FuncCloneAssignedToCurCallsiteClone &&
+ "No available func clone for this callsite clone");
+ AssignCallsiteCloneToFuncClone(
+ FuncCloneAssignedToCurCallsiteClone, Call, Clone,
+ /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
+ }
}
if (VerifyCCG) {
checkNode<DerivedCCG, FuncTy, CallTy>(Node);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 033ef8b..a43a6ee 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -708,10 +708,14 @@ static Instruction *shrinkSplatShuffle(TruncInst &Trunc,
auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0));
if (Shuf && Shuf->hasOneUse() && match(Shuf->getOperand(1), m_Undef()) &&
all_equal(Shuf->getShuffleMask()) &&
- Shuf->getType() == Shuf->getOperand(0)->getType()) {
+ ElementCount::isKnownGE(Shuf->getType()->getElementCount(),
+ cast<VectorType>(Shuf->getOperand(0)->getType())
+ ->getElementCount())) {
// trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Poison, SplatMask
// trunc (shuf X, Poison, SplatMask) --> shuf (trunc X), Poison, SplatMask
- Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType());
+ Type *NewTruncTy = Shuf->getOperand(0)->getType()->getWithNewType(
+ Trunc.getType()->getScalarType());
+ Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), NewTruncTy);
return new ShuffleVectorInst(NarrowOp, Shuf->getShuffleMask());
}
diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
index f6780c0..ce1d9f1 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -456,7 +456,7 @@ static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
if (DisableMemOPOPT)
return false;
- if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (F.hasOptSize())
return false;
MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
MemOPSizeOpt.perform();
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 0f63ed0..9b87180 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1360,13 +1360,10 @@ struct DSEState {
/// indicating whether \p I is a free-like call.
std::optional<std::pair<MemoryLocation, bool>>
getLocForTerminator(Instruction *I) const {
- uint64_t Len;
- Value *Ptr;
- if (match(I, m_Intrinsic<Intrinsic::lifetime_end>(m_ConstantInt(Len),
- m_Value(Ptr))))
- return {std::make_pair(MemoryLocation(Ptr, Len), false)};
-
if (auto *CB = dyn_cast<CallBase>(I)) {
+ if (CB->getIntrinsicID() == Intrinsic::lifetime_end)
+ return {
+ std::make_pair(MemoryLocation::getForArgument(CB, 1, &TLI), false)};
if (Value *FreedOp = getFreedOperand(CB, &TLI))
return {std::make_pair(MemoryLocation::getAfter(FreedOp), true)};
}
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index d6bd92d..b5eb647 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -1176,6 +1176,28 @@ private:
return true;
}
+  /// Retarget PHI nodes among the sunk instructions after fusion.
+  /// \p SafeToSink holds the instructions that are moved past the fused loop.
+  /// For every PHI node in that list, each incoming edge that currently comes
+  /// from the latch of the first loop (\p FC0) is rewritten to come from the
+  /// latch of \p FC1 instead. Non-PHI instructions are left untouched.
+  void fixPHINodes(ArrayRef<Instruction *> SafeToSink,
+                   const FusionCandidate &FC0,
+                   const FusionCandidate &FC1) const {
+    for (Instruction *Sunk : SafeToSink) {
+      // Only PHI nodes carry incoming-block information to patch up.
+      if (auto *PN = dyn_cast<PHINode>(Sunk)) {
+        const unsigned NumIncoming = PN->getNumIncomingValues();
+        for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
+          if (PN->getIncomingBlock(Idx) == FC0.Latch) {
+            assert(FC1.Latch && "FC1 latch is not set");
+            PN->setIncomingBlock(Idx, FC1.Latch);
+          }
+        }
+      }
+    }
+  }
+
+
/// Collect instructions in the \p FC1 Preheader that can be hoisted
/// to the \p FC0 Preheader or sunk into the \p FC1 Body
bool collectMovablePreheaderInsts(
@@ -1481,6 +1503,9 @@ private:
assert(I->getParent() == FC1.Preheader);
I->moveBefore(*FC1.ExitBlock, FC1.ExitBlock->getFirstInsertionPt());
}
+ // PHI nodes in SinkInsts need to be updated to receive values from the
+ // fused loop.
+ fixPHINodes(SinkInsts, FC0, FC1);
}
/// Determine if two fusion candidates have identical guards
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 6a3f656..1a52af1 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -651,7 +651,7 @@ class NewGVN {
BitVector TouchedInstructions;
DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
- mutable DenseMap<const IntrinsicInst *, const Value *> IntrinsicInstPred;
+ mutable DenseMap<const IntrinsicInst *, const Value *> PredicateSwapChoice;
#ifndef NDEBUG
// Debugging for how many times each block and instruction got processed.
@@ -840,7 +840,7 @@ private:
// Ranking
unsigned int getRank(const Value *) const;
bool shouldSwapOperands(const Value *, const Value *) const;
- bool shouldSwapOperandsForIntrinsic(const Value *, const Value *,
+ bool shouldSwapOperandsForPredicate(const Value *, const Value *,
const IntrinsicInst *I) const;
// Reachability handling.
@@ -1624,7 +1624,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(IntrinsicInst *I) const {
Value *AdditionallyUsedValue = CmpOp0;
// Sort the ops.
- if (shouldSwapOperandsForIntrinsic(FirstOp, SecondOp, I)) {
+ if (shouldSwapOperandsForPredicate(FirstOp, SecondOp, I)) {
std::swap(FirstOp, SecondOp);
Predicate = CmpInst::getSwappedPredicate(Predicate);
AdditionallyUsedValue = CmpOp1;
@@ -3024,7 +3024,7 @@ void NewGVN::cleanupTables() {
PredicateToUsers.clear();
MemoryToUsers.clear();
RevisitOnReachabilityChange.clear();
- IntrinsicInstPred.clear();
+ PredicateSwapChoice.clear();
}
// Assign local DFS number mapping to instructions, and leave space for Value
@@ -4250,20 +4250,18 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B);
}
-bool NewGVN::shouldSwapOperandsForIntrinsic(const Value *A, const Value *B,
+bool NewGVN::shouldSwapOperandsForPredicate(const Value *A, const Value *B,
const IntrinsicInst *I) const {
- auto LookupResult = IntrinsicInstPred.find(I);
if (shouldSwapOperands(A, B)) {
- if (LookupResult == IntrinsicInstPred.end())
- IntrinsicInstPred.insert({I, B});
- else
- LookupResult->second = B;
+ PredicateSwapChoice[I] = B;
return true;
}
- if (LookupResult != IntrinsicInstPred.end()) {
+ auto LookupResult = PredicateSwapChoice.find(I);
+ if (LookupResult != PredicateSwapChoice.end()) {
auto *SeenPredicate = LookupResult->second;
if (SeenPredicate) {
+ // We previously decided to swap B to the left. Keep that choice.
if (SeenPredicate == B)
return true;
else
diff --git a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
index 9fe655e..fca09c6 100644
--- a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
+++ b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
@@ -498,7 +498,7 @@ bool LibCallsShrinkWrap::perform(CallInst *CI) {
static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
DominatorTree *DT) {
- if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (F.hasOptSize())
return false;
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
LibCallsShrinkWrap CCDCE(TLI, DTU);
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index b9292af..b78c702 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -703,6 +703,7 @@ private:
// Add U as additional user of V.
void addAdditionalUser(Value *V, User *U) { AdditionalUsers[V].insert(U); }
+ void handlePredicate(Instruction *I, Value *CopyOf, const PredicateBase *PI);
void handleCallOverdefined(CallBase &CB);
void handleCallResult(CallBase &CB);
void handleCallArguments(CallBase &CB);
@@ -1927,6 +1928,75 @@ void SCCPInstVisitor::handleCallArguments(CallBase &CB) {
}
}
+void SCCPInstVisitor::handlePredicate(Instruction *I, Value *CopyOf,
+ const PredicateBase *PI) {
+ ValueLatticeElement CopyOfVal = getValueState(CopyOf);
+ const std::optional<PredicateConstraint> &Constraint = PI->getConstraint();
+ if (!Constraint) {
+ mergeInValue(ValueState[I], I, CopyOfVal);
+ return;
+ }
+
+ CmpInst::Predicate Pred = Constraint->Predicate;
+ Value *OtherOp = Constraint->OtherOp;
+
+ // Wait until OtherOp is resolved.
+ if (getValueState(OtherOp).isUnknown()) {
+ addAdditionalUser(OtherOp, I);
+ return;
+ }
+
+ ValueLatticeElement CondVal = getValueState(OtherOp);
+ ValueLatticeElement &IV = ValueState[I];
+ if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) {
+ auto ImposedCR =
+ ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType()));
+
+ // Get the range imposed by the condition.
+ if (CondVal.isConstantRange())
+ ImposedCR = ConstantRange::makeAllowedICmpRegion(
+ Pred, CondVal.getConstantRange());
+
+ // Combine range info for the original value with the new range from the
+ // condition.
+ auto CopyOfCR = CopyOfVal.asConstantRange(CopyOf->getType(),
+ /*UndefAllowed=*/true);
+ // Treat an unresolved input like a full range.
+ if (CopyOfCR.isEmptySet())
+ CopyOfCR = ConstantRange::getFull(CopyOfCR.getBitWidth());
+ auto NewCR = ImposedCR.intersectWith(CopyOfCR);
+ // If the existing information is != x, do not use the information from
+ // a chained predicate, as the != x information is more likely to be
+ // helpful in practice.
+ if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement())
+ NewCR = CopyOfCR;
+
+ // The new range is based on a branch condition. That guarantees that
+ // neither of the compare operands can be undef in the branch targets,
+ // unless we have conditions that are always true/false (e.g. icmp ule
+ // i32, %a, i32_max). For the latter overdefined/empty range will be
+ // inferred, but the branch will get folded accordingly anyways.
+ addAdditionalUser(OtherOp, I);
+ mergeInValue(
+ IV, I, ValueLatticeElement::getRange(NewCR, /*MayIncludeUndef*/ false));
+ return;
+ } else if (Pred == CmpInst::ICMP_EQ &&
+ (CondVal.isConstant() || CondVal.isNotConstant())) {
+ // For non-integer values or integer constant expressions, only
+ // propagate equal constants or not-constants.
+ addAdditionalUser(OtherOp, I);
+ mergeInValue(IV, I, CondVal);
+ return;
+ } else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant()) {
+ // Propagate inequalities.
+ addAdditionalUser(OtherOp, I);
+ mergeInValue(IV, I, ValueLatticeElement::getNot(CondVal.getConstant()));
+ return;
+ }
+
+ return (void)mergeInValue(IV, I, CopyOfVal);
+}
+
void SCCPInstVisitor::handleCallResult(CallBase &CB) {
Function *F = CB.getCalledFunction();
@@ -1936,77 +2006,10 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
return;
Value *CopyOf = CB.getOperand(0);
- ValueLatticeElement CopyOfVal = getValueState(CopyOf);
- const auto *PI = getPredicateInfoFor(&CB);
+ const PredicateBase *PI = getPredicateInfoFor(&CB);
assert(PI && "Missing predicate info for ssa.copy");
-
- const std::optional<PredicateConstraint> &Constraint =
- PI->getConstraint();
- if (!Constraint) {
- mergeInValue(ValueState[&CB], &CB, CopyOfVal);
- return;
- }
-
- CmpInst::Predicate Pred = Constraint->Predicate;
- Value *OtherOp = Constraint->OtherOp;
-
- // Wait until OtherOp is resolved.
- if (getValueState(OtherOp).isUnknown()) {
- addAdditionalUser(OtherOp, &CB);
- return;
- }
-
- ValueLatticeElement CondVal = getValueState(OtherOp);
- ValueLatticeElement &IV = ValueState[&CB];
- if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) {
- auto ImposedCR =
- ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType()));
-
- // Get the range imposed by the condition.
- if (CondVal.isConstantRange())
- ImposedCR = ConstantRange::makeAllowedICmpRegion(
- Pred, CondVal.getConstantRange());
-
- // Combine range info for the original value with the new range from the
- // condition.
- auto CopyOfCR = CopyOfVal.asConstantRange(CopyOf->getType(),
- /*UndefAllowed=*/true);
- // Treat an unresolved input like a full range.
- if (CopyOfCR.isEmptySet())
- CopyOfCR = ConstantRange::getFull(CopyOfCR.getBitWidth());
- auto NewCR = ImposedCR.intersectWith(CopyOfCR);
- // If the existing information is != x, do not use the information from
- // a chained predicate, as the != x information is more likely to be
- // helpful in practice.
- if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement())
- NewCR = CopyOfCR;
-
- // The new range is based on a branch condition. That guarantees that
- // neither of the compare operands can be undef in the branch targets,
- // unless we have conditions that are always true/false (e.g. icmp ule
- // i32, %a, i32_max). For the latter overdefined/empty range will be
- // inferred, but the branch will get folded accordingly anyways.
- addAdditionalUser(OtherOp, &CB);
- mergeInValue(
- IV, &CB,
- ValueLatticeElement::getRange(NewCR, /*MayIncludeUndef*/ false));
- return;
- } else if (Pred == CmpInst::ICMP_EQ &&
- (CondVal.isConstant() || CondVal.isNotConstant())) {
- // For non-integer values or integer constant expressions, only
- // propagate equal constants or not-constants.
- addAdditionalUser(OtherOp, &CB);
- mergeInValue(IV, &CB, CondVal);
- return;
- } else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant()) {
- // Propagate inequalities.
- addAdditionalUser(OtherOp, &CB);
- mergeInValue(IV, &CB,
- ValueLatticeElement::getNot(CondVal.getConstant()));
- return;
- }
-
- return (void)mergeInValue(IV, &CB, CopyOfVal);
+ handlePredicate(&CB, CopyOf, PI);
+ return;
}
if (II->getIntrinsicID() == Intrinsic::vscale) {
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index ddb062b..571fa11 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1257,7 +1257,7 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) {
assert(Diff->getType()->isIntegerTy() &&
"difference must be of integer type");
Value *DiffV = expand(Diff);
- Value *BaseV = &PN;
+ Value *BaseV = fixupLCSSAFormFor(&PN);
if (PhiTy->isPointerTy()) {
if (STy->isPointerTy())
return Builder.CreatePtrAdd(BaseV, DiffV);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6616e61f..40916d1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -93,6 +93,7 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -155,6 +156,7 @@
#include <utility>
using namespace llvm;
+using namespace SCEVPatternMatch;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
@@ -418,7 +420,24 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
/// ElementCount to include loops whose trip count is a function of vscale.
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
const Loop *L) {
- return ElementCount::getFixed(SE->getSmallConstantTripCount(L));
+ if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
+ return ElementCount::getFixed(ExpectedTC);
+
+ const SCEV *BTC = SE->getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BTC))
+ return ElementCount::getFixed(0);
+
+ const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L);
+ if (isa<SCEVVScale>(ExitCount))
+ return ElementCount::getScalable(1);
+
+ const APInt *Scale;
+ if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale())))
+ if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap())
+ if (Scale->getActiveBits() <= 32)
+ return ElementCount::getScalable(Scale->getZExtValue());
+
+ return ElementCount::getFixed(0);
}
/// Returns "best known" trip count, which is either a valid positive trip count
@@ -1363,11 +1382,15 @@ public:
TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
if (EVLIsLegal)
return;
- // If for some reason EVL mode is unsupported, fallback to
- // DataWithoutLaneMask to try to vectorize the loop with folded tail
- // in a generic way.
- ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
- TailFoldingStyle::DataWithoutLaneMask};
+ // If for some reason EVL mode is unsupported, fallback to a scalar epilogue
+ // if it's allowed, or DataWithoutLaneMask otherwise.
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
+ ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
+ ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
+ else
+ ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
+ TailFoldingStyle::DataWithoutLaneMask};
+
LLVM_DEBUG(
dbgs() << "LV: Preference for VP intrinsics indicated. Will "
"not try to generate VP Intrinsics "
@@ -2589,12 +2612,12 @@ static void cse(BasicBlock *BB) {
}
}
-/// This function attempts to return a value that represents the vectorization
-/// factor at runtime. For fixed-width VFs we know this precisely at compile
+/// This function attempts to return a value that represents the ElementCount
+/// at runtime. For fixed-width VFs we know this precisely at compile
/// time, but for scalable VFs we calculate it based on an estimate of the
/// vscale value.
-static unsigned getEstimatedRuntimeVF(ElementCount VF,
- std::optional<unsigned> VScale) {
+static unsigned estimateElementCount(ElementCount VF,
+ std::optional<unsigned> VScale) {
unsigned EstimatedVF = VF.getKnownMinValue();
if (VF.isScalable())
if (VScale)
@@ -2704,7 +2727,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// use the value of vscale used for tuning.
Loop *VectorLoop = LI->getLoopFor(HeaderBB);
unsigned EstimatedVFxUF =
- getEstimatedRuntimeVF(VF * UF, Cost->getVScaleForTuning());
+ estimateElementCount(VF * UF, Cost->getVScaleForTuning());
setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
}
@@ -4333,7 +4356,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
unsigned Width =
- getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
+ estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
<< " costs: " << (Candidate.Cost / Width));
if (VF.isScalable())
@@ -4441,7 +4464,7 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
? EpilogueVectorizationMinVF
: TTI.getEpilogueVectorizationMinVF();
- return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >=
+ return estimateElementCount(VF * Multiplier, VScaleForTuning) >=
MinVFThreshold;
}
@@ -4494,25 +4517,23 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
// the main loop handles 8 lanes per iteration. We could still benefit from
// vectorizing the epilogue loop with VF=4.
ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
- getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning()));
+ estimateElementCount(MainLoopVF, CM.getVScaleForTuning()));
ScalarEvolution &SE = *PSE.getSE();
Type *TCType = Legal->getWidestInductionType();
const SCEV *RemainingIterations = nullptr;
unsigned MaxTripCount = 0;
- if (MainLoopVF.isFixed()) {
- // TODO: extend to support scalable VFs.
- const SCEV *TC = vputils::getSCEVExprForVPValue(
- getPlanFor(MainLoopVF).getTripCount(), SE);
- assert(!isa<SCEVCouldNotCompute>(TC) &&
- "Trip count SCEV must be computable");
- RemainingIterations = SE.getURemExpr(
- TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
-
- // No iterations left to process in the epilogue.
- if (RemainingIterations->isZero())
- return Result;
+ const SCEV *TC =
+ vputils::getSCEVExprForVPValue(getPlanFor(MainLoopVF).getTripCount(), SE);
+ assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
+ RemainingIterations =
+ SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
+
+ // No iterations left to process in the epilogue.
+ if (RemainingIterations->isZero())
+ return Result;
+ if (MainLoopVF.isFixed()) {
MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
SE.getConstant(TCType, MaxTripCount))) {
@@ -4743,16 +4764,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}
- unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);
-
// Try to get the exact trip count, or an estimate based on profiling data or
// ConstantMax from PSE, failing that.
- if (auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop)) {
+ auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
+
+ // For fixed length VFs treat a scalable trip count as unknown.
+ if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
+ // Re-evaluate trip counts and VFs to be in the same numerical space.
+ unsigned AvailableTC = estimateElementCount(*BestKnownTC, VScaleForTuning);
+ unsigned EstimatedVF = estimateElementCount(VF, VScaleForTuning);
+
// At least one iteration must be scalar when this constraint holds. So the
// maximum available iterations for interleaving is one less.
- unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
- ? BestKnownTC->getFixedValue() - 1
- : BestKnownTC->getFixedValue();
+ if (requiresScalarEpilogue(VF.isVector()))
+ --AvailableTC;
unsigned InterleaveCountLB = bit_floor(std::max(
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
@@ -6923,7 +6948,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
// Now compute and add the VPlan-based cost.
Cost += Plan.cost(VF, CostCtx);
#ifndef NDEBUG
- unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
+ unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
<< " (Estimated cost per lane: ");
if (Cost.isValid()) {
@@ -9609,7 +9634,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
// the computations are performed on doubles, not integers and the result
// is rounded up, hence we get an upper estimate of the TC.
- unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
+ unsigned IntVF = estimateElementCount(VF.Width, VScale);
uint64_t RtC = TotalCost.getValue();
uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index eaf65e5..a5de593 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1841,6 +1841,10 @@ public:
getGEPNoWrapFlags(), getDebugLoc());
}
+ /// Return true if this VPVectorPointerRecipe corresponds to part 0. Note that
+ /// this is only accurate after the VPlan has been unrolled.
+ bool isFirstPart() const { return getUnrollPart(*this) == 0; }
+
/// Return the cost of this VPHeaderPHIRecipe.
InstructionCost computeCost(ElementCount VF,
VPCostContext &Ctx) const override {
@@ -4061,6 +4065,10 @@ public:
/// Returns VF * UF of the vector loop region.
VPValue &getVFxUF() { return VFxUF; }
+ LLVMContext &getContext() const {
+ return getScalarHeader()->getIRBasicBlock()->getContext();
+ }
+
void addVF(ElementCount VF) { VFs.insert(VF); }
void setVF(ElementCount VF) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 6e566e4..6c1f53b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -566,15 +566,15 @@ void VPlanTransforms::prepareForVectorization(
// the corresponding compare because they may have ended up with different
// line numbers and we want to avoid awkward line stepping while debugging.
// E.g., if the compare has got a line number inside the loop.
- DebugLoc LatchDL = TheLoop->getLoopLatch()->getTerminator()->getDebugLoc();
+ DebugLoc LatchDL = LatchVPBB->getTerminator()->getDebugLoc();
VPBuilder Builder(MiddleVPBB);
VPValue *Cmp;
if (!RequiresScalarEpilogueCheck)
- Cmp = Plan.getOrAddLiveIn(ConstantInt::getFalse(
- IntegerType::getInt1Ty(TripCount->getType()->getContext())));
+ Cmp = Plan.getOrAddLiveIn(
+ ConstantInt::getFalse(IntegerType::getInt1Ty(Plan.getContext())));
else if (TailFolded)
- Cmp = Plan.getOrAddLiveIn(ConstantInt::getTrue(
- IntegerType::getInt1Ty(TripCount->getType()->getContext())));
+ Cmp = Plan.getOrAddLiveIn(
+ ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext())));
else
Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(),
&Plan.getVectorTripCount(), LatchDL, "cmp.n");
@@ -648,7 +648,7 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
.createNaryOp(VPInstruction::BranchOnCond, {CondVPV},
Plan.getCanonicalIV()->getDebugLoc());
if (AddBranchWeights) {
- MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext());
+ MDBuilder MDB(Plan.getContext());
MDNode *BranchWeights =
MDB.createBranchWeights(CheckBypassWeights, /*IsExpected=*/false);
Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f979fcd..68e7c20 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -427,6 +427,7 @@ unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(const VPUser &U) const {
}
namespace llvm {
+template class VPUnrollPartAccessor<1>;
template class VPUnrollPartAccessor<2>;
template class VPUnrollPartAccessor<3>;
}
@@ -3390,12 +3391,7 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
- VectorType *InterleaveTy =
- VectorType::get(VecTy->getElementType(),
- VecTy->getElementCount().multiplyCoefficientBy(Factor));
- return Builder.CreateIntrinsic(InterleaveTy,
- getInterleaveIntrinsicID(Factor), Vals,
- /*FMFSource=*/nullptr, Name);
+ return Builder.CreateVectorInterleave(Vals, Name);
}
// Fixed length. Start by concatenating all vectors into a wide vector.
@@ -3502,8 +3498,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(InterleaveFactor <= 8 &&
"Unsupported deinterleave factor for scalable vectors");
NewLoad = State.Builder.CreateIntrinsic(
- getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(),
- NewLoad,
+ Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
+ NewLoad->getType(), NewLoad,
/*FMFSource=*/nullptr, "strided.vec");
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 650897e..8de05c1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1094,6 +1094,10 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
return Def->replaceAllUsesWith(A);
+ if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(0))))
+ return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1)
+ : R.getOperand(0));
+
if (match(Def, m_Not(m_VPValue(A)))) {
if (match(A, m_Not(m_VPValue(A))))
return Def->replaceAllUsesWith(A);
@@ -1172,6 +1176,14 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
if (!Plan->isUnrolled())
return;
+ // VPVectorPointer for part 0 can be replaced by their start pointer.
+ if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) {
+ if (VecPtr->isFirstPart()) {
+ VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
+ return;
+ }
+ }
+
// VPScalarIVSteps for part 0 can be replaced by their start value, if only
// the first lane is demanded.
if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
@@ -1362,7 +1374,7 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
unsigned NewBitWidth =
ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
- LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
+ LLVMContext &Ctx = Plan.getContext();
auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
bool MadeChange = false;
@@ -2511,8 +2523,8 @@ void VPlanTransforms::createInterleaveGroups(
DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
IG->getIndex(IRInsertPos),
/*IsSigned=*/true);
- VPValue *OffsetVPV = Plan.getOrAddLiveIn(
- ConstantInt::get(IRInsertPos->getParent()->getContext(), -Offset));
+ VPValue *OffsetVPV =
+ Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));
VPBuilder B(InsertPos);
Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
: B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
@@ -3369,7 +3381,7 @@ void VPlanTransforms::addBranchWeightToMiddleTerminator(
if (VF.isScalable() && VScaleForTuning.has_value())
VectorStep *= *VScaleForTuning;
assert(VectorStep > 0 && "trip count should not be zero");
- MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext());
+ MDBuilder MDB(Plan.getContext());
MDNode *BranchWeights =
MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);