Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Analysis/Analysis.cpp | 1
-rw-r--r--  llvm/lib/Analysis/CMakeLists.txt | 1
-rw-r--r--  llvm/lib/Analysis/DXILMetadataAnalysis.cpp | 16
-rw-r--r--  llvm/lib/Analysis/LoopAccessAnalysis.cpp | 44
-rw-r--r--  llvm/lib/Analysis/RuntimeLibcallInfo.cpp | 43
-rw-r--r--  llvm/lib/Analysis/TargetLibraryInfo.cpp | 6
-rw-r--r--  llvm/lib/Analysis/ValueTracking.cpp | 128
-rw-r--r--  llvm/lib/Analysis/VectorUtils.cpp | 11
-rw-r--r--  llvm/lib/BinaryFormat/CMakeLists.txt | 1
-rw-r--r--  llvm/lib/BinaryFormat/Minidump.cpp | 14
-rw-r--r--  llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 36
-rw-r--r--  llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 3
-rw-r--r--  llvm/lib/CodeGen/AtomicExpandPass.cpp | 19
-rw-r--r--  llvm/lib/CodeGen/CMakeLists.txt | 1
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp | 32
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 78
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/LibcallLoweringInfo.cpp | 26
-rw-r--r--  llvm/lib/CodeGen/MachineFunctionSplitter.cpp | 3
-rw-r--r--  llvm/lib/CodeGen/MachineInstrBundle.cpp | 42
-rw-r--r--  llvm/lib/CodeGen/MachineScheduler.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/MachineVerifier.cpp | 8
-rw-r--r--  llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/ReachingDefAnalysis.cpp | 3
-rw-r--r--  llvm/lib/CodeGen/RegisterCoalescer.cpp | 24
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 109
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 31
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 151
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 11
-rw-r--r--  llvm/lib/CodeGen/TargetLoweringBase.cpp | 8
-rw-r--r--  llvm/lib/DebugInfo/DWARF/DWARFDie.cpp | 4
-rw-r--r--  llvm/lib/Demangle/MicrosoftDemangle.cpp | 50
-rw-r--r--  llvm/lib/Demangle/MicrosoftDemangleNodes.cpp | 4
-rw-r--r--  llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp | 33
-rw-r--r--  llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp | 2
-rw-r--r--  llvm/lib/IR/AsmWriter.cpp | 6
-rw-r--r--  llvm/lib/IR/DebugLoc.cpp | 6
-rw-r--r--  llvm/lib/IR/DebugProgramInstruction.cpp | 6
-rw-r--r--  llvm/lib/IR/FPEnv.cpp | 16
-rw-r--r--  llvm/lib/IR/IRBuilder.cpp | 3
-rw-r--r--  llvm/lib/IR/Instructions.cpp | 10
-rw-r--r--  llvm/lib/IR/Operator.cpp | 4
-rw-r--r--  llvm/lib/IR/PassTimingInfo.cpp | 12
-rw-r--r--  llvm/lib/IR/ProfDataUtils.cpp | 9
-rw-r--r--  llvm/lib/IR/PseudoProbe.cpp | 12
-rw-r--r--  llvm/lib/IR/ReplaceConstant.cpp | 12
-rw-r--r--  llvm/lib/IR/RuntimeLibcalls.cpp | 85
-rw-r--r--  llvm/lib/IR/Use.cpp | 4
-rw-r--r--  llvm/lib/IR/User.cpp | 5
-rw-r--r--  llvm/lib/IR/Value.cpp | 8
-rw-r--r--  llvm/lib/IR/Verifier.cpp | 6
-rw-r--r--  llvm/lib/LTO/LTO.cpp | 7
-rw-r--r--  llvm/lib/MC/SPIRVObjectWriter.cpp | 7
-rw-r--r--  llvm/lib/Object/MachOObjectFile.cpp | 26
-rw-r--r--  llvm/lib/ObjectYAML/ELFYAML.cpp | 2
-rw-r--r--  llvm/lib/Passes/PassBuilder.cpp | 1
-rw-r--r--  llvm/lib/Passes/PassBuilderPipelines.cpp | 2
-rw-r--r--  llvm/lib/Passes/PassRegistry.def | 1
-rw-r--r--  llvm/lib/Support/Mustache.cpp | 226
-rw-r--r--  llvm/lib/TableGen/Record.cpp | 4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64CallingConvention.td | 6
-rw-r--r--  llvm/lib/Target/AArch64/AArch64FastISel.cpp | 5
-rw-r--r--  llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 8
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 84
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrGISel.td | 7
-rw-r--r--  llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 19
-rw-r--r--  llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp | 63
-rw-r--r--  llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h | 4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp | 14
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 168
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 16
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 3
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 27
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 21
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 29
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 10
-rw-r--r--  llvm/lib/Target/ARM/ARMFastISel.cpp | 2
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 78
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.h | 1
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 9
-rw-r--r--  llvm/lib/Target/BPF/BPFAsmPrinter.cpp | 14
-rw-r--r--  llvm/lib/Target/DirectX/DXContainerGlobals.cpp | 7
-rw-r--r--  llvm/lib/Target/DirectX/DXIL.td | 9
-rw-r--r--  llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp | 66
-rw-r--r--  llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp | 8
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp | 7
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 5
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp | 149
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchInstrInfo.h | 6
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td | 31
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp | 9
-rw-r--r--  llvm/lib/Target/Mips/MipsFastISel.cpp | 2
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 2
-rw-r--r--  llvm/lib/Target/PowerPC/PPCInstrFuture.td | 3
-rw-r--r--  llvm/lib/Target/PowerPC/PPCInstrMMA.td | 4
-rw-r--r--  llvm/lib/Target/RISCV/CMakeLists.txt | 1
-rw-r--r--  llvm/lib/Target/RISCV/RISCV.h | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFeatures.td | 5
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 171
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.h | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 7
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td | 1
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp | 213
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 3
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp | 5
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 11
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 5
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp | 8
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 6
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 4
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 72
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp | 16
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp | 11
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td | 22
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp | 3
-rw-r--r--  llvm/lib/Target/Sparc/Sparc.td | 12
-rw-r--r--  llvm/lib/Target/Sparc/SparcISelLowering.cpp | 8
-rw-r--r--  llvm/lib/Target/Target.cpp | 1
-rw-r--r--  llvm/lib/Target/WebAssembly/CMakeLists.txt | 1
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp | 2
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp | 27
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyISD.def | 64
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 85
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h | 12
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td | 5
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp | 20
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp | 24
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h | 17
-rw-r--r--  llvm/lib/Target/X86/X86.h | 26
-rw-r--r--  llvm/lib/Target/X86/X86CompressEVEX.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 4
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 207
-rw-r--r--  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp | 48
-rw-r--r--  llvm/lib/Target/X86/X86PartialReduction.cpp | 72
-rw-r--r--  llvm/lib/Target/X86/X86PassRegistry.def | 4
-rw-r--r--  llvm/lib/Target/X86/X86TargetMachine.cpp | 6
-rw-r--r--  llvm/lib/Target/X86/X86VZeroUpper.cpp | 2
-rw-r--r--  llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp | 2
-rw-r--r--  llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp | 2
-rw-r--r--  llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp | 21
-rw-r--r--  llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp | 3
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 77
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopFuse.cpp | 16
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 40
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp | 16
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 12
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 3
-rw-r--r--  llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 26
-rw-r--r--  llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Utils/CodeExtractor.cpp | 1
-rw-r--r--  llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp | 52
-rw-r--r--  llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp | 21
-rw-r--r--  llvm/lib/Transforms/Utils/LoopVersioning.cpp | 9
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 173
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 5
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp | 11
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h | 94
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 23
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 173
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanValue.h | 6
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 7
-rw-r--r--  llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 27
184 files changed, 3018 insertions, 1439 deletions
diff --git a/llvm/lib/Analysis/Analysis.cpp b/llvm/lib/Analysis/Analysis.cpp
index 9f5daf3..aaac2cf 100644
--- a/llvm/lib/Analysis/Analysis.cpp
+++ b/llvm/lib/Analysis/Analysis.cpp
@@ -63,6 +63,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
initializeRegionPrinterPass(Registry);
initializeRegionOnlyViewerPass(Registry);
initializeRegionOnlyPrinterPass(Registry);
+ initializeRuntimeLibraryInfoWrapperPass(Registry);
initializeSCEVAAWrapperPassPass(Registry);
initializeScalarEvolutionWrapperPassPass(Registry);
initializeStackSafetyGlobalInfoWrapperPassPass(Registry);
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index 16dd6f8..88ebd65 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -137,6 +137,7 @@ add_llvm_component_library(LLVMAnalysis
RegionPass.cpp
RegionPrinter.cpp
ReplayInlineAdvisor.cpp
+ RuntimeLibcallInfo.cpp
ScalarEvolution.cpp
ScalarEvolutionAliasAnalysis.cpp
ScalarEvolutionDivision.cpp
diff --git a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp
index 23f1aa8..bd77cba 100644
--- a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp
+++ b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp
@@ -66,6 +66,22 @@ static ModuleMetadataInfo collectMetadataInfo(Module &M) {
Success = llvm::to_integer(NumThreadsVec[2], EFP.NumThreadsZ, 10);
assert(Success && "Failed to parse Z component of numthreads");
}
+ // Get wavesize attribute value, if one exists
+ StringRef WaveSizeStr =
+ F.getFnAttribute("hlsl.wavesize").getValueAsString();
+ if (!WaveSizeStr.empty()) {
+ SmallVector<StringRef> WaveSizeVec;
+ WaveSizeStr.split(WaveSizeVec, ',');
+ assert(WaveSizeVec.size() == 3 && "Invalid wavesize specified");
+ // Read in the three component values of wavesize
+ [[maybe_unused]] bool Success =
+ llvm::to_integer(WaveSizeVec[0], EFP.WaveSizeMin, 10);
+ assert(Success && "Failed to parse Min component of wavesize");
+ Success = llvm::to_integer(WaveSizeVec[1], EFP.WaveSizeMax, 10);
+ assert(Success && "Failed to parse Max component of wavesize");
+ Success = llvm::to_integer(WaveSizeVec[2], EFP.WaveSizePref, 10);
+ assert(Success && "Failed to parse Preferred component of wavesize");
+ }
MMDAI.EntryPropertyVec.push_back(EFP);
}
return MMDAI;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index e27a9b1..5d88e5f 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -806,11 +806,11 @@ public:
typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI,
- MemoryDepChecker::DepCandidates &DA,
+ DominatorTree &DT, MemoryDepChecker::DepCandidates &DA,
PredicatedScalarEvolution &PSE,
SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
- : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE),
- LoopAliasScopes(LoopAliasScopes) {
+ : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA),
+ PSE(PSE), LoopAliasScopes(LoopAliasScopes) {
// We're analyzing dependences across loop iterations.
BAA.enableCrossIterationMode();
}
@@ -934,6 +934,9 @@ private:
/// The LoopInfo of the loop being checked.
const LoopInfo *LI;
+ /// The dominator tree of the function.
+ DominatorTree &DT;
+
/// Sets of potentially dependent accesses - members of one set share an
/// underlying pointer. The set "CheckDeps" identfies which sets really need a
/// dependence check.
@@ -1015,6 +1018,7 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
/// informating from the IR pointer value to determine no-wrap.
static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
Value *Ptr, Type *AccessTy, const Loop *L, bool Assume,
+ const DominatorTree &DT,
std::optional<int64_t> Stride = std::nullopt) {
// FIXME: This should probably only return true for NUW.
if (AR->getNoWrapFlags(SCEV::NoWrapMask))
@@ -1029,8 +1033,18 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
// case, the GEP would be poison and any memory access dependent on it would
// be immediate UB when executed.
if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Ptr);
- GEP && GEP->hasNoUnsignedSignedWrap())
- return true;
+ GEP && GEP->hasNoUnsignedSignedWrap()) {
+ // For the above reasoning to apply, the pointer must be dereferenced in
+ // every iteration.
+ if (L->getHeader() == L->getLoopLatch() ||
+ any_of(GEP->users(), [L, &DT, GEP](User *U) {
+ if (getLoadStorePointerOperand(U) != GEP)
+ return false;
+ BasicBlock *UserBB = cast<Instruction>(U)->getParent();
+ return !LoopAccessInfo::blockNeedsPredication(UserBB, L, &DT);
+ }))
+ return true;
+ }
if (!Stride)
Stride = getStrideFromAddRec(AR, L, AccessTy, Ptr, PSE);
@@ -1293,7 +1307,7 @@ bool AccessAnalysis::createCheckForAccess(
}
if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
- TheLoop, Assume))
+ TheLoop, Assume, DT))
return false;
}
@@ -1606,7 +1620,7 @@ void AccessAnalysis::processMemAccesses() {
/// Check whether the access through \p Ptr has a constant stride.
std::optional<int64_t>
llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
- const Loop *Lp,
+ const Loop *Lp, const DominatorTree &DT,
const DenseMap<Value *, const SCEV *> &StridesMap,
bool Assume, bool ShouldCheckWrap) {
const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
@@ -1630,7 +1644,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
if (!ShouldCheckWrap || !Stride)
return Stride;
- if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride))
+ if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, DT, Stride))
return Stride;
LLVM_DEBUG(
@@ -2047,10 +2061,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
BPtr->getType()->getPointerAddressSpace())
return MemoryDepChecker::Dependence::Unknown;
- std::optional<int64_t> StrideAPtr =
- getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true);
- std::optional<int64_t> StrideBPtr =
- getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true);
+ std::optional<int64_t> StrideAPtr = getPtrStride(
+ PSE, ATy, APtr, InnermostLoop, *DT, SymbolicStrides, true, true);
+ std::optional<int64_t> StrideBPtr = getPtrStride(
+ PSE, BTy, BPtr, InnermostLoop, *DT, SymbolicStrides, true, true);
const SCEV *Src = PSE.getSCEV(APtr);
const SCEV *Sink = PSE.getSCEV(BPtr);
@@ -2627,7 +2641,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
}
MemoryDepChecker::DepCandidates DepCands;
- AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes);
+ AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE,
+ LoopAliasScopes);
// Holds the analyzed pointers. We don't want to call getUnderlyingObjects
// multiple times on the same object. If the ptr is accessed twice, once
@@ -2691,7 +2706,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
bool IsReadOnlyPtr = false;
Type *AccessTy = getLoadStoreType(LD);
if (Seen.insert({Ptr, AccessTy}).second ||
- !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides)) {
+ !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, *DT, SymbolicStrides, false,
+ true)) {
++NumReads;
IsReadOnlyPtr = true;
}
diff --git a/llvm/lib/Analysis/RuntimeLibcallInfo.cpp b/llvm/lib/Analysis/RuntimeLibcallInfo.cpp
new file mode 100644
index 0000000..6fb4119
--- /dev/null
+++ b/llvm/lib/Analysis/RuntimeLibcallInfo.cpp
@@ -0,0 +1,43 @@
+//===- RuntimeLibcallInfo.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/RuntimeLibcallInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+AnalysisKey RuntimeLibraryAnalysis::Key;
+
+RTLIB::RuntimeLibcallsInfo
+RuntimeLibraryAnalysis::run(const Module &M, ModuleAnalysisManager &) {
+ return RTLIB::RuntimeLibcallsInfo(M);
+}
+
+INITIALIZE_PASS(RuntimeLibraryInfoWrapper, "runtime-library-info",
+ "Runtime Library Function Analysis", false, true)
+
+RuntimeLibraryInfoWrapper::RuntimeLibraryInfoWrapper()
+ : ImmutablePass(ID), RTLA(RTLIB::RuntimeLibcallsInfo(Triple())) {}
+
+char RuntimeLibraryInfoWrapper::ID = 0;
+
+ModulePass *llvm::createRuntimeLibraryInfoWrapperPass() {
+ return new RuntimeLibraryInfoWrapper();
+}
+
+void RuntimeLibraryInfoWrapper::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+}
+
+// Assume this is stable unless explicitly invalidated.
+bool RTLIB::RuntimeLibcallsInfo::invalidate(
+ Module &M, const PreservedAnalyses &PA,
+ ModuleAnalysisManager::Invalidator &) {
+ auto PAC = PA.getChecker<RuntimeLibraryAnalysis>();
+ return !PAC.preservedWhenStateless();
+}
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index 813632c..74f3a7d 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -388,6 +388,10 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setAvailableWithName(LibFunc_logbf, "_logbf");
else
TLI.setUnavailable(LibFunc_logbf);
+ TLI.setUnavailable(LibFunc_nextafter);
+ TLI.setUnavailable(LibFunc_nextafterf);
+ TLI.setUnavailable(LibFunc_nexttoward);
+ TLI.setUnavailable(LibFunc_nexttowardf);
TLI.setUnavailable(LibFunc_rint);
TLI.setUnavailable(LibFunc_rintf);
TLI.setUnavailable(LibFunc_round);
@@ -418,6 +422,8 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc_logbl);
TLI.setUnavailable(LibFunc_ilogbl);
TLI.setUnavailable(LibFunc_nearbyintl);
+ TLI.setUnavailable(LibFunc_nextafterl);
+ TLI.setUnavailable(LibFunc_nexttowardl);
TLI.setUnavailable(LibFunc_rintl);
TLI.setUnavailable(LibFunc_roundl);
TLI.setUnavailable(LibFunc_scalblnl);
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 0a72076..789a983 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -7419,84 +7419,20 @@ static bool canCreateUndefOrPoison(const Operator *Op, UndefPoisonKind Kind,
if (cast<ConstantInt>(II->getArgOperand(1))->isNullValue())
return false;
break;
- case Intrinsic::ctpop:
- case Intrinsic::bswap:
- case Intrinsic::bitreverse:
- case Intrinsic::fshl:
- case Intrinsic::fshr:
- case Intrinsic::smax:
- case Intrinsic::smin:
- case Intrinsic::scmp:
- case Intrinsic::umax:
- case Intrinsic::umin:
- case Intrinsic::ucmp:
- case Intrinsic::ptrmask:
- case Intrinsic::fptoui_sat:
- case Intrinsic::fptosi_sat:
- case Intrinsic::sadd_with_overflow:
- case Intrinsic::ssub_with_overflow:
- case Intrinsic::smul_with_overflow:
- case Intrinsic::uadd_with_overflow:
- case Intrinsic::usub_with_overflow:
- case Intrinsic::umul_with_overflow:
- case Intrinsic::sadd_sat:
- case Intrinsic::uadd_sat:
- case Intrinsic::ssub_sat:
- case Intrinsic::usub_sat:
- return false;
case Intrinsic::sshl_sat:
case Intrinsic::ushl_sat:
- return includesPoison(Kind) &&
- !shiftAmountKnownInRange(II->getArgOperand(1));
- case Intrinsic::fma:
- case Intrinsic::fmuladd:
- case Intrinsic::sqrt:
- case Intrinsic::powi:
- case Intrinsic::sin:
- case Intrinsic::cos:
- case Intrinsic::pow:
- case Intrinsic::log:
- case Intrinsic::log10:
- case Intrinsic::log2:
- case Intrinsic::exp:
- case Intrinsic::exp2:
- case Intrinsic::exp10:
- case Intrinsic::fabs:
- case Intrinsic::copysign:
- case Intrinsic::floor:
- case Intrinsic::ceil:
- case Intrinsic::trunc:
- case Intrinsic::rint:
- case Intrinsic::nearbyint:
- case Intrinsic::round:
- case Intrinsic::roundeven:
- case Intrinsic::fptrunc_round:
- case Intrinsic::canonicalize:
- case Intrinsic::arithmetic_fence:
- case Intrinsic::minnum:
- case Intrinsic::maxnum:
- case Intrinsic::minimum:
- case Intrinsic::maximum:
- case Intrinsic::minimumnum:
- case Intrinsic::maximumnum:
- case Intrinsic::is_fpclass:
- case Intrinsic::ldexp:
- case Intrinsic::frexp:
- return false;
- case Intrinsic::lround:
- case Intrinsic::llround:
- case Intrinsic::lrint:
- case Intrinsic::llrint:
- // If the value doesn't fit an unspecified value is returned (but this
- // is not poison).
- return false;
+ if (!includesPoison(Kind) ||
+ shiftAmountKnownInRange(II->getArgOperand(1)))
+ return false;
+ break;
}
}
[[fallthrough]];
case Instruction::CallBr:
case Instruction::Invoke: {
const auto *CB = cast<CallBase>(Op);
- return !CB->hasRetAttr(Attribute::NoUndef);
+ return !CB->hasRetAttr(Attribute::NoUndef) &&
+ !CB->hasFnAttr(Attribute::NoCreateUndefOrPoison);
}
case Instruction::InsertElement:
case Instruction::ExtractElement: {
@@ -10405,3 +10341,55 @@ const Value *llvm::stripNullTest(const Value *V) {
Value *llvm::stripNullTest(Value *V) {
return const_cast<Value *>(stripNullTest(const_cast<const Value *>(V)));
}
+
+bool llvm::collectPossibleValues(const Value *V,
+ SmallPtrSetImpl<const Constant *> &Constants,
+ unsigned MaxCount, bool AllowUndefOrPoison) {
+ SmallPtrSet<const Instruction *, 8> Visited;
+ SmallVector<const Instruction *, 8> Worklist;
+ auto Push = [&](const Value *V) -> bool {
+ if (auto *C = dyn_cast<Constant>(V)) {
+ if (!AllowUndefOrPoison && !isGuaranteedNotToBeUndefOrPoison(C))
+ return false;
+ // Check existence first to avoid unnecessary allocations.
+ if (Constants.contains(C))
+ return true;
+ if (Constants.size() == MaxCount)
+ return false;
+ Constants.insert(C);
+ return true;
+ }
+
+ if (auto *Inst = dyn_cast<Instruction>(V)) {
+ if (Visited.insert(Inst).second)
+ Worklist.push_back(Inst);
+ return true;
+ }
+ return false;
+ };
+ if (!Push(V))
+ return false;
+ while (!Worklist.empty()) {
+ const Instruction *CurInst = Worklist.pop_back_val();
+ switch (CurInst->getOpcode()) {
+ case Instruction::Select:
+ if (!Push(CurInst->getOperand(1)))
+ return false;
+ if (!Push(CurInst->getOperand(2)))
+ return false;
+ break;
+ case Instruction::PHI:
+ for (Value *IncomingValue : cast<PHINode>(CurInst)->incoming_values()) {
+ // Fast path for recurrence PHI.
+ if (IncomingValue == CurInst)
+ continue;
+ if (!Push(IncomingValue))
+ return false;
+ }
+ break;
+ default:
+ return false;
+ }
+ }
+ return true;
+}
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 091d948..977ed59 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1387,9 +1387,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
// wrap around the address space we would do a memory access at nullptr
// even without the transformation. The wrapping checks are therefore
// deferred until after we've formed the interleaved groups.
- int64_t Stride =
- getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides,
- /*Assume=*/true, /*ShouldCheckWrap=*/false).value_or(0);
+ int64_t Stride = getPtrStride(PSE, ElementTy, Ptr, TheLoop, *DT, Strides,
+ /*Assume=*/true, /*ShouldCheckWrap=*/false)
+ .value_or(0);
const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size,
@@ -1643,8 +1643,9 @@ void InterleavedAccessInfo::analyzeInterleaving(
assert(Member && "Group member does not exist");
Value *MemberPtr = getLoadStorePointerOperand(Member);
Type *AccessTy = getLoadStoreType(Member);
- if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides,
- /*Assume=*/false, /*ShouldCheckWrap=*/true).value_or(0))
+ if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, *DT, Strides,
+ /*Assume=*/false, /*ShouldCheckWrap=*/true)
+ .value_or(0))
return false;
LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
<< FirstOrLast
diff --git a/llvm/lib/BinaryFormat/CMakeLists.txt b/llvm/lib/BinaryFormat/CMakeLists.txt
index 4b2debb..0c8af1e 100644
--- a/llvm/lib/BinaryFormat/CMakeLists.txt
+++ b/llvm/lib/BinaryFormat/CMakeLists.txt
@@ -6,7 +6,6 @@ add_llvm_component_library(LLVMBinaryFormat
ELF.cpp
MachO.cpp
Magic.cpp
- Minidump.cpp
MsgPackDocument.cpp
MsgPackDocumentYAML.cpp
MsgPackReader.cpp
diff --git a/llvm/lib/BinaryFormat/Minidump.cpp b/llvm/lib/BinaryFormat/Minidump.cpp
deleted file mode 100644
index b618fb1..0000000
--- a/llvm/lib/BinaryFormat/Minidump.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-//===-- Minidump.cpp - Minidump constants and structures ---------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/BinaryFormat/Minidump.h"
-
-using namespace llvm::minidump;
-
-constexpr uint32_t Header::MagicSignature;
-constexpr uint16_t Header::MagicVersion;
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 466dcb0..8930d64 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -2257,6 +2257,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
return Attribute::Captures;
case bitc::ATTR_KIND_DEAD_ON_RETURN:
return Attribute::DeadOnReturn;
+ case bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON:
+ return Attribute::NoCreateUndefOrPoison;
}
}
@@ -8566,16 +8568,13 @@ Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() {
}
static Expected<std::pair<bool, bool>>
-getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream,
- unsigned ID,
- BitcodeLTOInfo &LTOInfo) {
+getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, unsigned ID) {
if (Error Err = Stream.EnterSubBlock(ID))
return std::move(Err);
- SmallVector<uint64_t, 64> Record;
+ SmallVector<uint64_t, 64> Record;
while (true) {
BitstreamEntry Entry;
- std::pair<bool, bool> Result = {false,false};
if (Error E = Stream.advanceSkippingSubblocks().moveInto(Entry))
return std::move(E);
@@ -8584,8 +8583,8 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream,
case BitstreamEntry::Error:
return error("Malformed block");
case BitstreamEntry::EndBlock: {
- // If no flags record found, set both flags to false.
- return Result;
+ // If no flags record found, return both flags as false.
+ return std::make_pair(false, false);
}
case BitstreamEntry::Record:
// The interesting case.
@@ -8607,9 +8606,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream,
bool EnableSplitLTOUnit = Flags & 0x8;
bool UnifiedLTO = Flags & 0x200;
- Result = {EnableSplitLTOUnit, UnifiedLTO};
-
- return Result;
+ return std::make_pair(EnableSplitLTOUnit, UnifiedLTO);
}
}
}
@@ -8638,26 +8635,15 @@ Expected<BitcodeLTOInfo> BitcodeModule::getLTOInfo() {
/*EnableSplitLTOUnit=*/false, /*UnifiedLTO=*/false};
case BitstreamEntry::SubBlock:
- if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID) {
- BitcodeLTOInfo LTOInfo;
+ if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID ||
+ Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) {
Expected<std::pair<bool, bool>> Flags =
- getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo);
+ getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID);
if (!Flags)
return Flags.takeError();
- std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get();
- LTOInfo.IsThinLTO = true;
- LTOInfo.HasSummary = true;
- return LTOInfo;
- }
-
- if (Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) {
BitcodeLTOInfo LTOInfo;
- Expected<std::pair<bool, bool>> Flags =
- getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo);
- if (!Flags)
- return Flags.takeError();
std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get();
- LTOInfo.IsThinLTO = false;
+ LTOInfo.IsThinLTO = (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID);
LTOInfo.HasSummary = true;
return LTOInfo;
}
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index f17656c..76494c7 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -956,6 +956,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
return bitc::ATTR_KIND_CAPTURES;
case Attribute::DeadOnReturn:
return bitc::ATTR_KIND_DEAD_ON_RETURN;
+ case Attribute::NoCreateUndefOrPoison:
+ return bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON;
case Attribute::EndAttrKinds:
llvm_unreachable("Can not encode end-attribute kinds marker.");
case Attribute::None:
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index f65d88a..713277d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1708,7 +1708,6 @@ void AsmPrinter::emitCallGraphSection(const MachineFunction &MF,
OutStreamer->pushSection();
OutStreamer->switchSection(FuncCGSection);
- const MCSymbol *FunctionSymbol = getFunctionBegin();
const Function &F = MF.getFunction();
// If this function has external linkage or has its address taken and
// it is not a callback, then anything could call it.
@@ -1747,7 +1746,7 @@ void AsmPrinter::emitCallGraphSection(const MachineFunction &MF,
// 8) Each unique indirect target type id.
OutStreamer->emitInt8(CallGraphSectionFormatVersion::V_0);
OutStreamer->emitInt8(static_cast<uint8_t>(CGFlags));
- OutStreamer->emitSymbolValue(FunctionSymbol, TM.getProgramPointerSize());
+ OutStreamer->emitSymbolValue(getSymbol(&F), TM.getProgramPointerSize());
const auto *TypeId = extractNumericCGTypeId(F);
if (IsIndirectTarget && TypeId)
OutStreamer->emitInt64(TypeId->getZExtValue());
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 53f1cfe2..d9bc042 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -38,6 +38,7 @@
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -1259,8 +1260,7 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop(
BasicBlock *BB = Builder.GetInsertBlock();
Function *F = BB->getParent();
- assert(AddrAlign >=
- F->getDataLayout().getTypeStoreSize(ResultTy) &&
+ assert(AddrAlign >= F->getDataLayout().getTypeStoreSize(ResultTy) &&
"Expected at least natural alignment at this point.");
// Given: atomicrmw some_op iN* %addr, iN %incr ordering
@@ -1295,7 +1295,13 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop(
TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
Value *TryAgain = Builder.CreateICmpNE(
StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
- Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
+
+ Instruction *CondBr = Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
+
+ // Atomic RMW expands to a Load-linked / Store-Conditional loop. Because it is
+ // hard to predict precise branch weights, we mark the branch as "unknown"
+ // (50/50) to prevent misleading optimizations.
+ setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, DEBUG_TYPE);
Builder.SetInsertPoint(ExitBB, ExitBB->begin());
return Loaded;
@@ -1680,7 +1686,12 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
Loaded->addIncoming(NewLoaded, LoopBB);
- Builder.CreateCondBr(Success, ExitBB, LoopBB);
+ Instruction *CondBr = Builder.CreateCondBr(Success, ExitBB, LoopBB);
+
+ // Atomic RMW expands to a cmpxchg loop. Since precise branch weights
+ // cannot be easily determined here, we mark the branch as "unknown" (50/50)
+ // to prevent misleading optimizations.
+ setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, DEBUG_TYPE);
Builder.SetInsertPoint(ExitBB, ExitBB->begin());
return NewLoaded;
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index 4373c53..1cf0b49 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -88,6 +88,7 @@ add_llvm_component_library(LLVMCodeGen
LatencyPriorityQueue.cpp
LazyMachineBlockFrequencyInfo.cpp
LexicalScopes.cpp
+ LibcallLoweringInfo.cpp
LiveDebugVariables.cpp
LiveIntervals.cpp
LiveInterval.cpp
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
index d6f23b6..c1fb8b6 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
@@ -643,6 +643,38 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known,
Known.Zero.setBitsFrom(LowBits);
break;
}
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
+ GExtractVectorElement &Extract = cast<GExtractVectorElement>(MI);
+ Register InVec = Extract.getVectorReg();
+ Register EltNo = Extract.getIndexReg();
+
+ auto ConstEltNo = getIConstantVRegVal(EltNo, MRI);
+
+ LLT VecVT = MRI.getType(InVec);
+ // computeKnownBits not yet implemented for scalable vectors.
+ if (VecVT.isScalableVector())
+ break;
+
+ const unsigned EltBitWidth = VecVT.getScalarSizeInBits();
+ const unsigned NumSrcElts = VecVT.getNumElements();
+ // A return type different from the vector's element type may lead to
+ // issues with pattern selection. Bail out to avoid that.
+ if (BitWidth > EltBitWidth)
+ break;
+
+ Known.Zero.setAllBits();
+ Known.One.setAllBits();
+
+ // If we know the element index, just demand that vector element, else for
+ // an unknown element index, ignore DemandedElts and demand them all.
+ APInt DemandedSrcElts = APInt::getAllOnes(NumSrcElts);
+ if (ConstEltNo && ConstEltNo->ult(NumSrcElts))
+ DemandedSrcElts =
+ APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue());
+
+ computeKnownBitsImpl(InVec, Known, DemandedSrcElts, Depth + 1);
+ break;
+ }
case TargetOpcode::G_SHUFFLE_VECTOR: {
APInt DemandedLHS, DemandedRHS;
// Collect the known bits that are shared by every vector element referenced
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 1fc90d0..be1b51f 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -294,6 +294,10 @@ void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) {
MachinePreds[Edge].push_back(NewPred);
}
+static bool targetSupportsBF16Type(const MachineFunction *MF) {
+ return MF->getTarget().getTargetTriple().isSPIRV();
+}
+
static bool containsBF16Type(const User &U) {
// BF16 cannot currently be represented by LLT, to avoid miscompiles we
// prevent any instructions using them. FIXME: This can be removed once LLT
@@ -306,7 +310,7 @@ static bool containsBF16Type(const User &U) {
bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U,
MachineIRBuilder &MIRBuilder) {
- if (containsBF16Type(U))
+ if (containsBF16Type(U) && !targetSupportsBF16Type(MF))
return false;
// Get or create a virtual register for each value.
@@ -328,7 +332,7 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U,
bool IRTranslator::translateUnaryOp(unsigned Opcode, const User &U,
MachineIRBuilder &MIRBuilder) {
- if (containsBF16Type(U))
+ if (containsBF16Type(U) && !targetSupportsBF16Type(MF))
return false;
Register Op0 = getOrCreateVReg(*U.getOperand(0));
@@ -348,7 +352,7 @@ bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) {
bool IRTranslator::translateCompare(const User &U,
MachineIRBuilder &MIRBuilder) {
- if (containsBF16Type(U))
+ if (containsBF16Type(U) && !targetSupportsBF16Type(MF))
return false;
auto *CI = cast<CmpInst>(&U);
@@ -1569,7 +1573,7 @@ bool IRTranslator::translateBitCast(const User &U,
bool IRTranslator::translateCast(unsigned Opcode, const User &U,
MachineIRBuilder &MIRBuilder) {
- if (containsBF16Type(U))
+ if (containsBF16Type(U) && !targetSupportsBF16Type(MF))
return false;
uint32_t Flags = 0;
@@ -2688,7 +2692,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
bool IRTranslator::translateInlineAsm(const CallBase &CB,
MachineIRBuilder &MIRBuilder) {
- if (containsBF16Type(CB))
+ if (containsBF16Type(CB) && !targetSupportsBF16Type(MF))
return false;
const InlineAsmLowering *ALI = MF->getSubtarget().getInlineAsmLowering();
@@ -2779,7 +2783,7 @@ bool IRTranslator::translateCallBase(const CallBase &CB,
}
bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
- if (!MF->getTarget().getTargetTriple().isSPIRV() && containsBF16Type(U))
+ if (containsBF16Type(U) && !targetSupportsBF16Type(MF))
return false;
const CallInst &CI = cast<CallInst>(U);
@@ -2817,20 +2821,34 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
if (translateKnownIntrinsic(CI, ID, MIRBuilder))
return true;
+ TargetLowering::IntrinsicInfo Info;
+ bool IsTgtMemIntrinsic = TLI->getTgtMemIntrinsic(Info, CI, *MF, ID);
+
+ return translateIntrinsic(CI, ID, MIRBuilder,
+ IsTgtMemIntrinsic ? &Info : nullptr);
+}
+
+/// Translate a call to an intrinsic.
+/// If TLI->getTgtMemIntrinsic() returned true for this call,
+/// TgtMemIntrinsicInfo points to the correspondingly populated IntrinsicInfo
+/// object; otherwise this pointer is null.
+bool IRTranslator::translateIntrinsic(
+ const CallBase &CB, Intrinsic::ID ID, MachineIRBuilder &MIRBuilder,
+ const TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo) {
ArrayRef<Register> ResultRegs;
- if (!CI.getType()->isVoidTy())
- ResultRegs = getOrCreateVRegs(CI);
+ if (!CB.getType()->isVoidTy())
+ ResultRegs = getOrCreateVRegs(CB);
// Ignore the callsite attributes. Backend code is most likely not expecting
// an intrinsic to sometimes have side effects and sometimes not.
MachineInstrBuilder MIB = MIRBuilder.buildIntrinsic(ID, ResultRegs);
- if (isa<FPMathOperator>(CI))
- MIB->copyIRFlags(CI);
+ if (isa<FPMathOperator>(CB))
+ MIB->copyIRFlags(CB);
- for (const auto &Arg : enumerate(CI.args())) {
+ for (const auto &Arg : enumerate(CB.args())) {
// If this is required to be an immediate, don't materialize it in a
// register.
- if (CI.paramHasAttr(Arg.index(), Attribute::ImmArg)) {
+ if (CB.paramHasAttr(Arg.index(), Attribute::ImmArg)) {
if (ConstantInt *CI = dyn_cast<ConstantInt>(Arg.value())) {
// imm arguments are more convenient than cimm (and realistically
// probably sufficient), so use them.
@@ -2859,29 +2877,33 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
}
// Add a MachineMemOperand if it is a target mem intrinsic.
- TargetLowering::IntrinsicInfo Info;
- // TODO: Add a GlobalISel version of getTgtMemIntrinsic.
- if (TLI->getTgtMemIntrinsic(Info, CI, *MF, ID)) {
- Align Alignment = Info.align.value_or(
- DL->getABITypeAlign(Info.memVT.getTypeForEVT(F->getContext())));
- LLT MemTy = Info.memVT.isSimple()
- ? getLLTForMVT(Info.memVT.getSimpleVT())
- : LLT::scalar(Info.memVT.getStoreSizeInBits());
+ if (TgtMemIntrinsicInfo) {
+ const Function *F = CB.getCalledFunction();
+
+ Align Alignment = TgtMemIntrinsicInfo->align.value_or(DL->getABITypeAlign(
+ TgtMemIntrinsicInfo->memVT.getTypeForEVT(F->getContext())));
+ LLT MemTy =
+ TgtMemIntrinsicInfo->memVT.isSimple()
+ ? getLLTForMVT(TgtMemIntrinsicInfo->memVT.getSimpleVT())
+ : LLT::scalar(TgtMemIntrinsicInfo->memVT.getStoreSizeInBits());
// TODO: We currently just fallback to address space 0 if getTgtMemIntrinsic
// didn't yield anything useful.
MachinePointerInfo MPI;
- if (Info.ptrVal)
- MPI = MachinePointerInfo(Info.ptrVal, Info.offset);
- else if (Info.fallbackAddressSpace)
- MPI = MachinePointerInfo(*Info.fallbackAddressSpace);
+ if (TgtMemIntrinsicInfo->ptrVal) {
+ MPI = MachinePointerInfo(TgtMemIntrinsicInfo->ptrVal,
+ TgtMemIntrinsicInfo->offset);
+ } else if (TgtMemIntrinsicInfo->fallbackAddressSpace) {
+ MPI = MachinePointerInfo(*TgtMemIntrinsicInfo->fallbackAddressSpace);
+ }
MIB.addMemOperand(MF->getMachineMemOperand(
- MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata(),
- /*Ranges=*/nullptr, Info.ssid, Info.order, Info.failureOrder));
+ MPI, TgtMemIntrinsicInfo->flags, MemTy, Alignment, CB.getAAMetadata(),
+ /*Ranges=*/nullptr, TgtMemIntrinsicInfo->ssid,
+ TgtMemIntrinsicInfo->order, TgtMemIntrinsicInfo->failureOrder));
}
- if (CI.isConvergent()) {
- if (auto Bundle = CI.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+ if (CB.isConvergent()) {
+ if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_convergencectrl)) {
auto *Token = Bundle->Inputs[0].get();
Register TokenReg = getOrCreateVReg(*Token);
MIB.addUse(TokenReg, RegState::Implicit);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 52c43a4..d02f097 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -776,7 +776,7 @@ llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
break;
case TargetOpcode::G_MEMCPY:
RTLibcall = RTLIB::MEMCPY;
- Name = TLI.getMemcpyName();
+ Name = TLI.getLibcallImplName(TLI.getMemcpyImpl()).data();
Args[0].Flags[0].setReturned();
break;
case TargetOpcode::G_MEMMOVE:
diff --git a/llvm/lib/CodeGen/LibcallLoweringInfo.cpp b/llvm/lib/CodeGen/LibcallLoweringInfo.cpp
new file mode 100644
index 0000000..5c1698c
--- /dev/null
+++ b/llvm/lib/CodeGen/LibcallLoweringInfo.cpp
@@ -0,0 +1,26 @@
+//===- LibcallLoweringInfo.cpp - Interface for runtime libcalls -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LibcallLoweringInfo.h"
+
+using namespace llvm;
+
+LibcallLoweringInfo::LibcallLoweringInfo(
+ const RTLIB::RuntimeLibcallsInfo &RTLCI)
+ : RTLCI(RTLCI) {
+ // TODO: This should be generated with lowering predicates, and assert the
+ // call is available.
+ for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) {
+ if (RTLCI.isAvailable(Impl)) {
+ RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl);
+ // FIXME: Hack, assume the first available libcall wins.
+ if (LibcallImpls[LC] == RTLIB::Unsupported)
+ LibcallImpls[LC] = Impl;
+ }
+ }
+}
diff --git a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp
index c31454a..b5d3092 100644
--- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp
@@ -129,6 +129,9 @@ static bool isColdBlock(const MachineBasicBlock &MBB,
}
bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
// Do not split functions when -basic-block-sections=all is specified.
if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All)
return false;
diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp
index da29ffc..f4c1a8b 100644
--- a/llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -83,15 +83,21 @@ llvm::createUnpackMachineBundles(
return new UnpackMachineBundles(std::move(Ftor));
}
-/// Return the first found DebugLoc that has a DILocation, given a range of
-/// instructions. The search range is from FirstMI to LastMI (exclusive). If no
-/// DILocation is found, then an empty location is returned.
+/// Return the first DebugLoc that has line number information, given a
+/// range of instructions. The search range is from FirstMI to LastMI
+/// (exclusive). Otherwise return the first DILocation or an empty location if
+/// there are none.
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI,
MachineBasicBlock::instr_iterator LastMI) {
- for (auto MII = FirstMI; MII != LastMI; ++MII)
- if (MII->getDebugLoc())
- return MII->getDebugLoc();
- return DebugLoc();
+ DebugLoc DL;
+ for (auto MII = FirstMI; MII != LastMI; ++MII) {
+ if (DebugLoc MIIDL = MII->getDebugLoc()) {
+ if (MIIDL.getLine() != 0)
+ return MIIDL;
+ DL = MIIDL.get();
+ }
+ }
+ return DL;
}
/// Check if target reg is contained in given lists, which are:
@@ -136,6 +142,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
SmallSetVector<Register, 8> ExternUses;
SmallSet<Register, 8> KilledUseSet;
SmallSet<Register, 8> UndefUseSet;
+ SmallVector<std::pair<Register, Register>> TiedOperands;
for (auto MII = FirstMI; MII != LastMI; ++MII) {
// Debug instructions have no effects to track.
if (MII->isDebugInstr())
@@ -161,6 +168,15 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
// External def is now killed.
KilledUseSet.insert(Reg);
}
+ if (MO.isTied() && Reg.isVirtual()) {
+ // Record tied operand constraints that involve virtual registers so
+ // that bundles that are formed pre-register allocation reflect the
+ // relevant constraints.
+ unsigned TiedIdx = MII->findTiedOperandIdx(MO.getOperandNo());
+ MachineOperand &TiedMO = MII->getOperand(TiedIdx);
+ Register DefReg = TiedMO.getReg();
+ TiedOperands.emplace_back(DefReg, Reg);
+ }
}
}
@@ -203,7 +219,17 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
bool isKill = KilledUseSet.contains(Reg);
bool isUndef = UndefUseSet.contains(Reg);
MIB.addReg(Reg, getKillRegState(isKill) | getUndefRegState(isUndef) |
- getImplRegState(true));
+ getImplRegState(true));
+ }
+
+ for (auto [DefReg, UseReg] : TiedOperands) {
+ unsigned DefIdx =
+ std::distance(LocalDefs.begin(), llvm::find(LocalDefs, DefReg));
+ unsigned UseIdx =
+ std::distance(ExternUses.begin(), llvm::find(ExternUses, UseReg));
+ assert(DefIdx < LocalDefs.size());
+ assert(UseIdx < ExternUses.size());
+ MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx);
}
}
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index f18c051..7399370 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -2559,7 +2559,7 @@ init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
for (unsigned i = 0; i < ResourceCount; ++i) {
ReservedCyclesIndex[i] = NumUnits;
NumUnits += SchedModel->getProcResource(i)->NumUnits;
- if (isUnbufferedGroup(i)) {
+ if (isReservedGroup(i)) {
auto SubUnits = SchedModel->getProcResource(i)->SubUnitsIdxBegin;
for (unsigned U = 0, UE = SchedModel->getProcResource(i)->NumUnits;
U != UE; ++U)
@@ -2631,7 +2631,7 @@ SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx,
assert(NumberOfInstances > 0 &&
"Cannot have zero instances of a ProcResource");
- if (isUnbufferedGroup(PIdx)) {
+ if (isReservedGroup(PIdx)) {
// If any subunits are used by the instruction, report that the
// subunits of the resource group are available at the first cycle
// in which the unit is available, effectively removing the group
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index c0710c4..fdf1048 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -2584,6 +2584,14 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
report("Extra explicit operand on non-variadic instruction", MO, MONum);
}
+ // Verify earlyClobber def operand
+ if (MCID.getOperandConstraint(MONum, MCOI::EARLY_CLOBBER) != -1) {
+ if (!MO->isReg())
+ report("Early clobber must be a register", MI);
+ if (!MO->isEarlyClobber())
+ report("Missing earlyClobber flag", MI);
+ }
+
switch (MO->getType()) {
case MachineOperand::MO_Register: {
// Verify debug flag on debug instructions. Check this first because reg0
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 620d3d3..d738dc4 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -244,7 +244,7 @@ static bool canEmitMemcpy(const TargetMachine *TM, Function *F) {
if (!TM)
return true;
const TargetLowering *TLI = TM->getSubtargetImpl(*F)->getTargetLowering();
- return TLI->getMemcpyName() != nullptr;
+ return TLI->getMemcpyImpl() != RTLIB::Unsupported;
}
// Return a value appropriate for use with the memset_pattern16 libcall, if
diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
index 40a8907..61706e1 100644
--- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
+++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -193,7 +193,6 @@ void ReachingDefInfo::processDefs(MachineInstr *MI) {
for (auto &MO : MI->operands()) {
if (MO.isFI()) {
int FrameIndex = MO.getIndex();
- assert(FrameIndex >= 0 && "Can't handle negative frame indicies yet!");
if (!isFIDef(*MI, FrameIndex, TII))
continue;
MBBFrameObjsReachingDefs[{MBBNumber, FrameIndex}].push_back(CurInstr);
@@ -302,8 +301,6 @@ void ReachingDefInfo::print(raw_ostream &OS) {
Register Reg;
if (MO.isFI()) {
int FrameIndex = MO.getIndex();
- assert(FrameIndex >= 0 &&
- "Can't handle negative frame indicies yet!");
Reg = Register::index2StackSlot(FrameIndex);
} else if (MO.isReg()) {
if (MO.isDef())
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 38f6deb..99f7693 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -1600,6 +1600,22 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP,
SlotIndex DefIndex =
CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber());
VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator();
+
+ // Refine the subranges that are now defined by the remat.
+ // This will split existing subranges if necessary.
+ DstInt.refineSubRanges(
+ Alloc, DstMask,
+ [&DefIndex, &Alloc](LiveInterval::SubRange &SR) {
+ // We know that this lane is defined by this instruction,
+ // but at this point it might not be live because it was not defined
+ // by the original instruction. This happens when the
+ // rematerialization widens the defined register. Assign that lane a
+ // dead def so that the interferences are properly modeled.
+ if (!SR.liveAt(DefIndex))
+ SR.createDeadDef(DefIndex, Alloc);
+ },
+ *LIS->getSlotIndexes(), *TRI);
+
for (LiveInterval::SubRange &SR : DstInt.subranges()) {
if ((SR.LaneMask & DstMask).none()) {
LLVM_DEBUG(dbgs()
@@ -1617,14 +1633,6 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP,
// updateRegDefUses. The original subrange def may have only undefed
// some lanes.
UpdatedSubRanges = true;
- } else {
- // We know that this lane is defined by this instruction,
- // but at this point it might not be live because it was not defined
- // by the original instruction. This happens when the
- // rematerialization widens the defined register. Assign that lane a
- // dead def so that the interferences are properly modeled.
- if (!SR.liveAt(DefIndex))
- SR.createDeadDef(DefIndex, Alloc);
}
}
if (UpdatedSubRanges)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 46c4bb8..816b7ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4046,6 +4046,8 @@ static SDValue foldSubCtlzNot(SDNode *N, SelectionDAG &DAG) {
m_ConstInt(AndMask)))) {
// Type Legalisation Pattern:
// (sub (ctlz (and (xor Op XorMask) AndMask)) BitWidthDiff)
+ if (BitWidthDiff.getZExtValue() >= BitWidth)
+ return SDValue();
unsigned AndMaskWidth = BitWidth - BitWidthDiff.getZExtValue();
if (!(AndMask.isMask(AndMaskWidth) && XorMask.countr_one() >= AndMaskWidth))
return SDValue();
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 431a810..316aacd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -163,6 +163,8 @@ private:
RTLIB::Libcall CallI128);
void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+ SDValue ExpandSincosStretLibCall(SDNode *Node) const;
+
SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT,
const SDLoc &dl);
SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT,
@@ -2423,6 +2425,101 @@ static bool useSinCos(SDNode *Node) {
return false;
}
+SDValue SelectionDAGLegalize::ExpandSincosStretLibCall(SDNode *Node) const {
+ // For iOS, we want to call an alternative entry point: __sincos_stret,
+ // which returns the values in two S / D registers.
+ SDLoc dl(Node);
+ SDValue Arg = Node->getOperand(0);
+ EVT ArgVT = Arg.getValueType();
+ RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT);
+ RTLIB::LibcallImpl SincosStret = TLI.getLibcallImpl(LC);
+ if (SincosStret == RTLIB::Unsupported)
+ return SDValue();
+
+ /// There are 3 different ABI cases to handle:
+ /// - Direct return of separate fields in registers
+ /// - Single return as vector elements
+ /// - sret struct
+
+ const RTLIB::RuntimeLibcallsInfo &CallsInfo = TLI.getRuntimeLibcallsInfo();
+
+ const DataLayout &DL = DAG.getDataLayout();
+
+ auto [FuncTy, FuncAttrs] = CallsInfo.getFunctionTy(
+ *DAG.getContext(), TM.getTargetTriple(), DL, SincosStret);
+
+ Type *SincosStretRetTy = FuncTy->getReturnType();
+ CallingConv::ID CallConv = CallsInfo.getLibcallImplCallingConv(SincosStret);
+ StringRef LibcallImplName = CallsInfo.getLibcallImplName(SincosStret);
+
+ SDValue Callee = DAG.getExternalSymbol(LibcallImplName.data(),
+ TLI.getProgramPointerTy(DL));
+
+ TargetLowering::ArgListTy Args;
+ SDValue SRet;
+
+ int FrameIdx;
+ if (FuncTy->getParamType(0)->isPointerTy()) {
+ // Uses sret
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+ AttributeSet PtrAttrs = FuncAttrs.getParamAttrs(0);
+ Type *StructTy = PtrAttrs.getStructRetType();
+ const uint64_t ByteSize = DL.getTypeAllocSize(StructTy);
+ const Align StackAlign = DL.getPrefTypeAlign(StructTy);
+
+ FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
+ SRet = DAG.getFrameIndex(FrameIdx, TLI.getFrameIndexTy(DL));
+
+ TargetLowering::ArgListEntry Entry(SRet, FuncTy->getParamType(0));
+ Entry.IsSRet = true;
+ Entry.IndirectType = StructTy;
+ Entry.Alignment = StackAlign;
+
+ Args.push_back(Entry);
+ Args.emplace_back(Arg, FuncTy->getParamType(1));
+ } else {
+ Args.emplace_back(Arg, FuncTy->getParamType(0));
+ }
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(CallConv, SincosStretRetTy, Callee, std::move(Args))
+ .setIsPostTypeLegalization();
+
+ std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+
+ if (SRet) {
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
+ SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, PtrInfo);
+
+ TypeSize StoreSize = ArgVT.getStoreSize();
+
+ // Address of cos field.
+ SDValue Add = DAG.getObjectPtrOffset(dl, SRet, StoreSize);
+ SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add,
+ PtrInfo.getWithOffset(StoreSize));
+
+ SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, LoadSin.getValue(0),
+ LoadCos.getValue(0));
+ }
+
+ if (!CallResult.first.getValueType().isVector())
+ return CallResult.first;
+
+ SDValue SinVal =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
+ DAG.getVectorIdxConstant(0, dl));
+ SDValue CosVal =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
+ DAG.getVectorIdxConstant(1, dl));
+ SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
+}
+
SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const {
SDLoc dl(Node);
EVT VT = Node->getValueType(0);
@@ -4730,6 +4827,18 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
case ISD::FSINCOS:
case ISD::FSINCOSPI: {
EVT VT = Node->getValueType(0);
+
+ if (Node->getOpcode() == ISD::FSINCOS) {
+ RTLIB::Libcall SincosStret = RTLIB::getSINCOS_STRET(VT);
+ if (SincosStret != RTLIB::UNKNOWN_LIBCALL) {
+ if (SDValue Expanded = ExpandSincosStretLibCall(Node)) {
+ Results.push_back(Expanded);
+ Results.push_back(Expanded.getValue(1));
+ break;
+ }
+ }
+ }
+
RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS
? RTLIB::getSINCOS(VT)
: RTLIB::getSINCOSPI(VT);
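
A short sketch of the availability check this new FSINCOS path hinges on: the stret expansion only fires when the target maps the SINCOS_STRET libcall for the type to a supported implementation (Darwin's __sincos_stret / __sincosf_stret); otherwise lowering falls through to the plain sincos/sincospi libcall as before. TLI and VT are assumed to be the in-scope lowering info and value type.

  RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(VT);
  bool CanUseStret = LC != RTLIB::UNKNOWN_LIBCALL &&
                     TLI.getLibcallImpl(LC) != RTLIB::Unsupported;
  // true  -> ExpandSincosStretLibCall emits a single call that yields both
  //          results (via an sret slot, a vector return, or two registers)
  // false -> fall back to RTLIB::getSINCOS(VT) as before
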
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 379242e..ff6a7b2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -9257,21 +9257,22 @@ SDValue SelectionDAG::getMemcpy(
// FIXME: pass in SDLoc
TargetLowering::CallLoweringInfo CLI(*this);
bool IsTailCall = false;
- const char *MemCpyName = TLI->getMemcpyName();
+ RTLIB::LibcallImpl MemCpyImpl = TLI->getMemcpyImpl();
if (OverrideTailCall.has_value()) {
IsTailCall = *OverrideTailCall;
} else {
- bool LowersToMemcpy = StringRef(MemCpyName) == StringRef("memcpy");
+ bool LowersToMemcpy = MemCpyImpl == RTLIB::impl_memcpy;
IsTailCall = isInTailCallPositionWrapper(CI, this, LowersToMemcpy);
}
CLI.setDebugLoc(dl)
.setChain(Chain)
.setLibCallee(
- TLI->getLibcallCallingConv(RTLIB::MEMCPY),
+ TLI->getLibcallImplCallingConv(MemCpyImpl),
Dst.getValueType().getTypeForEVT(*getContext()),
- getExternalSymbol(MemCpyName, TLI->getPointerTy(getDataLayout())),
+ getExternalSymbol(TLI->getLibcallImplName(MemCpyImpl).data(),
+ TLI->getPointerTy(getDataLayout())),
std::move(Args))
.setDiscardResult()
.setTailCall(IsTailCall);
@@ -9361,22 +9362,24 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
// FIXME: pass in SDLoc
TargetLowering::CallLoweringInfo CLI(*this);
+ RTLIB::LibcallImpl MemmoveImpl = TLI->getLibcallImpl(RTLIB::MEMMOVE);
+
bool IsTailCall = false;
if (OverrideTailCall.has_value()) {
IsTailCall = *OverrideTailCall;
} else {
- bool LowersToMemmove =
- TLI->getLibcallName(RTLIB::MEMMOVE) == StringRef("memmove");
+ bool LowersToMemmove = MemmoveImpl == RTLIB::impl_memmove;
IsTailCall = isInTailCallPositionWrapper(CI, this, LowersToMemmove);
}
CLI.setDebugLoc(dl)
.setChain(Chain)
- .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
- Dst.getValueType().getTypeForEVT(*getContext()),
- getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
- TLI->getPointerTy(getDataLayout())),
- std::move(Args))
+ .setLibCallee(
+ TLI->getLibcallImplCallingConv(MemmoveImpl),
+ Dst.getValueType().getTypeForEVT(*getContext()),
+ getExternalSymbol(TLI->getLibcallImplName(MemmoveImpl).data(),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
.setDiscardResult()
.setTailCall(IsTailCall);
@@ -9492,8 +9495,10 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
TLI->getPointerTy(DL)),
std::move(Args));
}
- bool LowersToMemset =
- TLI->getLibcallName(RTLIB::MEMSET) == StringRef("memset");
+
+ RTLIB::LibcallImpl MemsetImpl = TLI->getLibcallImpl(RTLIB::MEMSET);
+ bool LowersToMemset = MemsetImpl == RTLIB::impl_memset;
+
// If we're going to use bzero, make sure not to tail call unless the
// subsequent return doesn't need a value, as bzero doesn't return the first
// arg unlike memset.
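
The common pattern behind the memcpy, memmove and memset hunks above, as a sketch: resolve the libcall to an RTLIB::LibcallImpl once, then use that enum both for the "does this lower to the standard C function?" test and for the symbol name and calling convention, instead of comparing name strings. MEMCPY is shown; the other two are analogous.

  RTLIB::LibcallImpl Impl = TLI->getLibcallImpl(RTLIB::MEMCPY);
  bool LowersToMemcpy = Impl == RTLIB::impl_memcpy;      // was: name == "memcpy"
  const char *Symbol = TLI->getLibcallImplName(Impl).data();
  CallingConv::ID CC = TLI->getLibcallImplCallingConv(Impl);
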
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index fa0c899..9961c98 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3526,8 +3526,7 @@ void SelectionDAGBuilder::visitCallBr(const CallBrInst &I) {
// Update successor info.
addSuccessorWithProb(CallBrMBB, Return, BranchProbability::getOne());
- for (unsigned i = 0, e = I.getNumIndirectDests(); i < e; ++i) {
- BasicBlock *Dest = I.getIndirectDest(i);
+ for (BasicBlock *Dest : I.getIndirectDests()) {
MachineBasicBlock *Target = FuncInfo.getMBB(Dest);
Target->setIsInlineAsmBrIndirectTarget();
// If we introduce a type of asm goto statement that is permitted to use an
@@ -5313,18 +5312,26 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
DAG.setRoot(OutChain);
}
-/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC
-/// node.
-void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
- unsigned Intrinsic) {
- // Ignore the callsite's attributes. A specific call site may be marked with
- // readnone, but the lowering code will expect the chain based on the
- // definition.
+/// Check if this intrinsic call depends on the chain (1st return value)
+/// and if it only *loads* memory.
+/// Ignore the callsite's attributes. A specific call site may be marked with
+/// readnone, but the lowering code will expect the chain based on the
+/// definition.
+std::pair<bool, bool>
+SelectionDAGBuilder::getTargetIntrinsicCallProperties(const CallBase &I) {
const Function *F = I.getCalledFunction();
bool HasChain = !F->doesNotAccessMemory();
bool OnlyLoad =
HasChain && F->onlyReadsMemory() && F->willReturn() && F->doesNotThrow();
+ return {HasChain, OnlyLoad};
+}
+
+SmallVector<SDValue, 8> SelectionDAGBuilder::getTargetIntrinsicOperands(
+ const CallBase &I, bool HasChain, bool OnlyLoad,
+ TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
// Build the operand list.
SmallVector<SDValue, 8> Ops;
if (HasChain) { // If this intrinsic has side-effects, chainify it.
@@ -5336,17 +5343,10 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
}
}
- // Info is set by getTgtMemIntrinsic
- TargetLowering::IntrinsicInfo Info;
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I,
- DAG.getMachineFunction(),
- Intrinsic);
-
// Add the intrinsic ID as an integer operand if it's not a target intrinsic.
- if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID ||
- Info.opc == ISD::INTRINSIC_W_CHAIN)
- Ops.push_back(DAG.getTargetConstant(Intrinsic, getCurSDLoc(),
+ if (!TgtMemIntrinsicInfo || TgtMemIntrinsicInfo->opc == ISD::INTRINSIC_VOID ||
+ TgtMemIntrinsicInfo->opc == ISD::INTRINSIC_W_CHAIN)
+ Ops.push_back(DAG.getTargetConstant(I.getIntrinsicID(), getCurSDLoc(),
TLI.getPointerTy(DAG.getDataLayout())));
// Add all operands of the call to the operand list.
@@ -5369,13 +5369,85 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
}
}
+ if (std::optional<OperandBundleUse> Bundle =
+ I.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+ Value *Token = Bundle->Inputs[0].get();
+ SDValue ConvControlToken = getValue(Token);
+ assert(Ops.back().getValueType() != MVT::Glue &&
+ "Did not expect another glue node here.");
+ ConvControlToken =
+ DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue, ConvControlToken);
+ Ops.push_back(ConvControlToken);
+ }
+
+ return Ops;
+}
+
+SDVTList SelectionDAGBuilder::getTargetIntrinsicVTList(const CallBase &I,
+ bool HasChain) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs);
if (HasChain)
ValueVTs.push_back(MVT::Other);
- SDVTList VTs = DAG.getVTList(ValueVTs);
+ return DAG.getVTList(ValueVTs);
+}
+
+/// Get an INTRINSIC node for a target intrinsic which does not touch memory.
+SDValue SelectionDAGBuilder::getTargetNonMemIntrinsicNode(
+ const Type &IntrinsicVT, bool HasChain, ArrayRef<SDValue> Ops,
+ const SDVTList &VTs) {
+ if (!HasChain)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
+ if (!IntrinsicVT.isVoidTy())
+ return DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops);
+ return DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops);
+}
+
+/// Set root, convert return type if necessary and check alignment.
+SDValue SelectionDAGBuilder::handleTargetIntrinsicRet(const CallBase &I,
+ bool HasChain,
+ bool OnlyLoad,
+ SDValue Result) {
+ if (HasChain) {
+ SDValue Chain = Result.getValue(Result.getNode()->getNumValues() - 1);
+ if (OnlyLoad)
+ PendingLoads.push_back(Chain);
+ else
+ DAG.setRoot(Chain);
+ }
+
+ if (I.getType()->isVoidTy())
+ return Result;
+
+ if (MaybeAlign Alignment = I.getRetAlign(); InsertAssertAlign && Alignment) {
+ // Insert `assertalign` node if there's an alignment.
+ Result = DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne());
+ } else if (!isa<VectorType>(I.getType())) {
+ Result = lowerRangeToAssertZExt(DAG, I, Result);
+ }
+
+ return Result;
+}
+
+/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC
+/// node.
+void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
+ unsigned Intrinsic) {
+ auto [HasChain, OnlyLoad] = getTargetIntrinsicCallProperties(I);
+
+ // Info is set by getTgtMemIntrinsic
+ TargetLowering::IntrinsicInfo Info;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ bool IsTgtMemIntrinsic =
+ TLI.getTgtMemIntrinsic(Info, I, DAG.getMachineFunction(), Intrinsic);
+
+ SmallVector<SDValue, 8> Ops = getTargetIntrinsicOperands(
+ I, HasChain, OnlyLoad, IsTgtMemIntrinsic ? &Info : nullptr);
+ SDVTList VTs = getTargetIntrinsicVTList(I, HasChain);
// Propagate fast-math-flags from IR to node(s).
SDNodeFlags Flags;
@@ -5386,19 +5458,9 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
// Create the node.
SDValue Result;
- if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) {
- auto *Token = Bundle->Inputs[0].get();
- SDValue ConvControlToken = getValue(Token);
- assert(Ops.back().getValueType() != MVT::Glue &&
- "Did not expected another glue node here.");
- ConvControlToken =
- DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue, ConvControlToken);
- Ops.push_back(ConvControlToken);
- }
-
// In some cases, custom collection of operands from CallInst I may be needed.
TLI.CollectTargetIntrinsicOperands(I, Ops, DAG);
- if (IsTgtIntrinsic) {
+ if (IsTgtMemIntrinsic) {
// This is target intrinsic that touches memory
//
// TODO: We currently just fallback to address space 0 if getTgtMemIntrinsic
@@ -5418,34 +5480,11 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
Info.ssid, Info.order, Info.failureOrder);
Result =
DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, MemVT, MMO);
- } else if (!HasChain) {
- Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
- } else if (!I.getType()->isVoidTy()) {
- Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops);
} else {
- Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops);
+ Result = getTargetNonMemIntrinsicNode(*I.getType(), HasChain, Ops, VTs);
}
- if (HasChain) {
- SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1);
- if (OnlyLoad)
- PendingLoads.push_back(Chain);
- else
- DAG.setRoot(Chain);
- }
-
- if (!I.getType()->isVoidTy()) {
- if (!isa<VectorType>(I.getType()))
- Result = lowerRangeToAssertZExt(DAG, I, Result);
-
- MaybeAlign Alignment = I.getRetAlign();
-
- // Insert `assertalign` node if there's an alignment.
- if (InsertAssertAlign && Alignment) {
- Result =
- DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne());
- }
- }
+ Result = handleTargetIntrinsicRet(I, HasChain, OnlyLoad, Result);
setValue(&I, Result);
}
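
With the lowering split into query / operand / VT / finish helpers, the body of visitTargetIntrinsic reduces to the composition shown above; a hypothetical second caller that lowers a non-memory target intrinsic from a CallBase CB could reuse the same helpers in the same order (sketch only, using the members declared in the SelectionDAGBuilder.h hunk that follows):

  auto [HasChain, OnlyLoad] = getTargetIntrinsicCallProperties(CB);
  SmallVector<SDValue, 8> Ops = getTargetIntrinsicOperands(CB, HasChain, OnlyLoad);
  SDVTList VTs = getTargetIntrinsicVTList(CB, HasChain);
  SDValue Result = getTargetNonMemIntrinsicNode(*CB.getType(), HasChain, Ops, VTs);
  Result = handleTargetIntrinsicRet(CB, HasChain, OnlyLoad, Result);
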
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 47e19f7..ed63bee 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -727,6 +727,17 @@ private:
MCSymbol *&BeginLabel);
SDValue lowerEndEH(SDValue Chain, const InvokeInst *II,
const BasicBlock *EHPadBB, MCSymbol *BeginLabel);
+
+ std::pair<bool, bool> getTargetIntrinsicCallProperties(const CallBase &I);
+ SmallVector<SDValue, 8> getTargetIntrinsicOperands(
+ const CallBase &I, bool HasChain, bool OnlyLoad,
+ TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo = nullptr);
+ SDVTList getTargetIntrinsicVTList(const CallBase &I, bool HasChain);
+ SDValue getTargetNonMemIntrinsicNode(const Type &IntrinsicVT, bool HasChain,
+ ArrayRef<SDValue> Ops,
+ const SDVTList &VTs);
+ SDValue handleTargetIntrinsicRet(const CallBase &I, bool HasChain,
+ bool OnlyLoad, SDValue Result);
};
/// This struct represents the registers (physical or virtual)
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index b3535eac..1cc591c 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -697,9 +697,11 @@ ISD::CondCode TargetLoweringBase::getSoftFloatCmpLibcallPredicate(
/// NOTE: The TargetMachine owns TLOF.
TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
- : TM(tm), Libcalls(TM.getTargetTriple(), TM.Options.ExceptionModel,
- TM.Options.FloatABIType, TM.Options.EABIVersion,
- TM.Options.MCOptions.getABIName()) {
+ : TM(tm),
+ RuntimeLibcallInfo(TM.getTargetTriple(), TM.Options.ExceptionModel,
+ TM.Options.FloatABIType, TM.Options.EABIVersion,
+ TM.Options.MCOptions.getABIName()),
+ Libcalls(RuntimeLibcallInfo) {
initActions();
// Perform these initializations only once.
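
A generic, self-contained sketch of the C++ rule the reordered initializer list relies on: non-static data members are constructed in declaration order, so RuntimeLibcallInfo must be declared before Libcalls for Libcalls(RuntimeLibcallInfo) to receive a fully constructed object.

  struct Info     { int Version; explicit Info(int V) : Version(V) {} };
  struct Libcalls { const Info &I; explicit Libcalls(const Info &I) : I(I) {} };

  struct LoweringBase {
    Info     RuntimeInfo; // declared first, so constructed first
    Libcalls Calls;       // safe: RuntimeInfo is already alive at this point
    LoweringBase() : RuntimeInfo(1), Calls(RuntimeInfo) {}
  };
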
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
index 6c78ef0..7496c5a 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -704,7 +704,9 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent,
DIDumpOptions ChildDumpOpts = DumpOpts;
ChildDumpOpts.ShowParents = false;
while (Child) {
- Child.dump(OS, Indent + 2, ChildDumpOpts);
+ if (DumpOpts.FilterChildTag.empty() ||
+ llvm::is_contained(DumpOpts.FilterChildTag, Child.getTag()))
+ Child.dump(OS, Indent + 2, ChildDumpOpts);
Child = Child.getSibling();
}
}
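
A hedged usage sketch of the new child filter, assuming DIDumpOptions::FilterChildTag is a list of dwarf::Tag values as the is_contained() check implies (the container type itself is not visible in this hunk); Die stands for any DWARFDie being dumped.

  DIDumpOptions Opts;
  Opts.ShowChildren = true;
  Opts.FilterChildTag.push_back(dwarf::DW_TAG_subprogram); // keep only subprograms
  Die.dump(outs(), /*Indent=*/0, Opts);                    // other children skipped
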
diff --git a/llvm/lib/Demangle/MicrosoftDemangle.cpp b/llvm/lib/Demangle/MicrosoftDemangle.cpp
index b22928b..0aefe6e 100644
--- a/llvm/lib/Demangle/MicrosoftDemangle.cpp
+++ b/llvm/lib/Demangle/MicrosoftDemangle.cpp
@@ -277,6 +277,18 @@ demanglePointerCVQualifiers(std::string_view &MangledName) {
DEMANGLE_UNREACHABLE;
}
+static NodeArrayNode *nodeListToNodeArray(ArenaAllocator &Arena, NodeList *Head,
+ size_t Count) {
+ NodeArrayNode *N = Arena.alloc<NodeArrayNode>();
+ N->Count = Count;
+ N->Nodes = Arena.allocArray<Node *>(Count);
+ for (size_t I = 0; I < Count; ++I) {
+ N->Nodes[I] = Head->N;
+ Head = Head->Next;
+ }
+ return N;
+}
+
std::string_view Demangler::copyString(std::string_view Borrowed) {
char *Stable = Arena.allocUnalignedBuffer(Borrowed.size());
  // This is not a micro-optimization, it avoids UB, should Borrowed be a null
@@ -323,8 +335,30 @@ Demangler::demangleSpecialTableSymbolNode(std::string_view &MangledName,
}
std::tie(STSN->Quals, IsMember) = demangleQualifiers(MangledName);
- if (!consumeFront(MangledName, '@'))
- STSN->TargetName = demangleFullyQualifiedTypeName(MangledName);
+
+ NodeList *TargetCurrent = nullptr;
+ NodeList *TargetHead = nullptr;
+ size_t Count = 0;
+ while (!consumeFront(MangledName, '@')) {
+ ++Count;
+
+ NodeList *Next = Arena.alloc<NodeList>();
+ if (TargetCurrent)
+ TargetCurrent->Next = Next;
+ else
+ TargetHead = Next;
+
+ TargetCurrent = Next;
+ QualifiedNameNode *QN = demangleFullyQualifiedTypeName(MangledName);
+ if (Error)
+ return nullptr;
+ assert(QN);
+ TargetCurrent->N = QN;
+ }
+
+ if (Count > 0)
+ STSN->TargetNames = nodeListToNodeArray(Arena, TargetHead, Count);
+
return STSN;
}
@@ -1605,18 +1639,6 @@ Demangler::demangleNameScopePiece(std::string_view &MangledName) {
return demangleSimpleName(MangledName, /*Memorize=*/true);
}
-static NodeArrayNode *nodeListToNodeArray(ArenaAllocator &Arena, NodeList *Head,
- size_t Count) {
- NodeArrayNode *N = Arena.alloc<NodeArrayNode>();
- N->Count = Count;
- N->Nodes = Arena.allocArray<Node *>(Count);
- for (size_t I = 0; I < Count; ++I) {
- N->Nodes[I] = Head->N;
- Head = Head->Next;
- }
- return N;
-}
-
QualifiedNameNode *
Demangler::demangleNameScopeChain(std::string_view &MangledName,
IdentifierNode *UnqualifiedName) {
diff --git a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp
index 61e4961..17c6aab 100644
--- a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp
+++ b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp
@@ -662,9 +662,9 @@ void VcallThunkIdentifierNode::output(OutputBuffer &OB,
void SpecialTableSymbolNode::output(OutputBuffer &OB, OutputFlags Flags) const {
outputQualifiers(OB, Quals, false, true);
Name->output(OB, Flags);
- if (TargetName) {
+ if (TargetNames) {
OB << "{for `";
- TargetName->output(OB, Flags);
+ TargetNames->output(OB, Flags, "'s `");
OB << "'}";
}
}
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
index 35da82a..7e1d528 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
@@ -184,9 +184,9 @@ class SymbolSearchContext {
public:
SymbolSearchContext(SymbolQuery &Q) : Q(Q) {}
- bool hasSearched(LibraryInfo *Lib) const { return Searched.count(Lib); }
+ bool hasSearched(const LibraryInfo *Lib) const { return Searched.count(Lib); }
- void markSearched(LibraryInfo *Lib) { Searched.insert(Lib); }
+ void markSearched(const LibraryInfo *Lib) { Searched.insert(Lib); }
inline bool allResolved() const { return Q.allResolved(); }
@@ -194,7 +194,7 @@ public:
private:
SymbolQuery &Q;
- DenseSet<LibraryInfo *> Searched;
+ DenseSet<const LibraryInfo *> Searched;
};
void LibraryResolver::resolveSymbolsInLibrary(
@@ -226,19 +226,18 @@ void LibraryResolver::resolveSymbolsInLibrary(
return EnumerateResult::Continue;
},
Opts);
+ };
+ if (!Lib.hasFilter()) {
+ LLVM_DEBUG(dbgs() << "Building filter for library: " << Lib.getFullPath()
+ << "\n";);
+ enumerateSymbolsIfNeeded();
if (DiscoveredSymbols.empty()) {
      LLVM_DEBUG(dbgs() << "  No symbols found; removing library: "
<< Lib.getFullPath() << "\n";);
LibMgr.removeLibrary(Lib.getFullPath());
return;
}
- };
-
- if (!Lib.hasFilter()) {
- LLVM_DEBUG(dbgs() << "Building filter for library: " << Lib.getFullPath()
- << "\n";);
- enumerateSymbolsIfNeeded();
SmallVector<StringRef> SymbolVec;
SymbolVec.reserve(DiscoveredSymbols.size());
for (const auto &KV : DiscoveredSymbols)
@@ -288,11 +287,15 @@ void LibraryResolver::searchSymbolsInLibraries(
SymbolSearchContext Ctx(Q);
while (!Ctx.allResolved()) {
+ std::vector<std::shared_ptr<LibraryInfo>> Libs;
+ LibMgr.getLibraries(S, K, Libs, [&](const LibraryInfo &Lib) {
+ return !Ctx.hasSearched(&Lib);
+ });
- for (auto &Lib : LibMgr.getView(S, K)) {
- if (Ctx.hasSearched(Lib.get()))
- continue;
+ if (Libs.empty() && !scanLibrariesIfNeeded(K, scanBatchSize))
+ break; // no more new libs to scan
+ for (auto &Lib : Libs) {
// can use Async here?
resolveSymbolsInLibrary(*Lib, Ctx.query(), Config.Options);
Ctx.markSearched(Lib.get());
@@ -300,12 +303,6 @@ void LibraryResolver::searchSymbolsInLibraries(
if (Ctx.allResolved())
return;
}
-
- if (Ctx.allResolved())
- return;
-
- if (!scanLibrariesIfNeeded(K, scanBatchSize))
- break; // no more new libs to scan
}
};
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
index d93f686..32f6dbe 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
@@ -50,7 +50,7 @@ void handleError(Error Err, StringRef context = "") {
}
bool ObjectFileLoader::isArchitectureCompatible(const object::ObjectFile &Obj) {
- Triple HostTriple(sys::getDefaultTargetTriple());
+ Triple HostTriple(sys::getProcessTriple());
Triple ObjTriple = Obj.makeTriple();
LLVM_DEBUG({
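
An illustration of the distinction behind this one-line change: sys::getDefaultTargetTriple() reflects how LLVM was configured at build time, while sys::getProcessTriple() describes the process that is actually running, which is what an architecture-compatibility check for loadable libraries should compare against. The two can differ, for example, when LLVM is built with a default target triple other than the machine it ends up running on.

  #include "llvm/Support/raw_ostream.h"
  #include "llvm/TargetParser/Host.h"
  #include "llvm/TargetParser/Triple.h"
  using namespace llvm;

  static void printTriples() {
    Triple DefaultTT(sys::getDefaultTargetTriple()); // build-time default target
    Triple ProcessTT(sys::getProcessTriple());       // what this process really is
    outs() << DefaultTT.str() << " vs " << ProcessTT.str() << "\n";
  }
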
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 95d954f..0c8565c 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -758,14 +758,12 @@ void TypePrinting::printStructBody(StructType *STy, raw_ostream &OS) {
AbstractSlotTrackerStorage::~AbstractSlotTrackerStorage() = default;
-namespace llvm {
-
//===----------------------------------------------------------------------===//
// SlotTracker Class: Enumerate slot numbers for unnamed values
//===----------------------------------------------------------------------===//
/// This class provides computation of slot numbers for LLVM Assembly writing.
///
-class SlotTracker : public AbstractSlotTrackerStorage {
+class llvm::SlotTracker : public AbstractSlotTrackerStorage {
public:
/// ValueMap - A mapping of Values to slot numbers.
using ValueMap = DenseMap<const Value *, unsigned>;
@@ -943,8 +941,6 @@ private:
void processDbgRecordMetadata(const DbgRecord &DVR);
};
-} // end namespace llvm
-
ModuleSlotTracker::ModuleSlotTracker(SlotTracker &Machine, const Module *M,
const Function *F)
: M(M), F(F), Machine(&Machine) {}
diff --git a/llvm/lib/IR/DebugLoc.cpp b/llvm/lib/IR/DebugLoc.cpp
index 01dafca..bfba6e0 100644
--- a/llvm/lib/IR/DebugLoc.cpp
+++ b/llvm/lib/IR/DebugLoc.cpp
@@ -10,10 +10,11 @@
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DebugInfo.h"
+using namespace llvm;
+
#if LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN
#include "llvm/Support/Signals.h"
-namespace llvm {
DbgLocOrigin::DbgLocOrigin(bool ShouldCollectTrace) {
if (!ShouldCollectTrace)
return;
@@ -30,11 +31,8 @@ void DbgLocOrigin::addTrace() {
auto &[Depth, StackTrace] = StackTraces.emplace_back();
Depth = sys::getStackTrace(StackTrace);
}
-} // namespace llvm
#endif
-using namespace llvm;
-
#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
DILocAndCoverageTracking::DILocAndCoverageTracking(const DILocation *L)
: TrackingMDNodeRef(const_cast<DILocation *>(L)), DbgLocOrigin(!L),
diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp
index d9357bb..6b1fd39 100644
--- a/llvm/lib/IR/DebugProgramInstruction.cpp
+++ b/llvm/lib/IR/DebugProgramInstruction.cpp
@@ -12,8 +12,9 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Compiler.h"
-namespace llvm {
+using namespace llvm;
+namespace llvm {
template <typename T>
DbgRecordParamRef<T>::DbgRecordParamRef(const T *Param)
: Ref(const_cast<T *>(Param)) {}
@@ -28,6 +29,7 @@ template <typename T> T *DbgRecordParamRef<T>::get() const {
template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DIExpression>;
template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DILabel>;
template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DILocalVariable>;
+} // namespace llvm
DbgVariableRecord::DbgVariableRecord(const DbgVariableIntrinsic *DVI)
: DbgRecord(ValueKind, DVI->getDebugLoc()),
@@ -755,5 +757,3 @@ iterator_range<simple_ilist<DbgRecord>::iterator> DbgMarker::cloneDebugInfoFrom(
// We inserted a block at the end, return that range.
return {First->getIterator(), StoredDbgRecords.end()};
}
-
-} // end namespace llvm
diff --git a/llvm/lib/IR/FPEnv.cpp b/llvm/lib/IR/FPEnv.cpp
index 67f21d3..c41d7b3 100644
--- a/llvm/lib/IR/FPEnv.cpp
+++ b/llvm/lib/IR/FPEnv.cpp
@@ -19,9 +19,10 @@
#include "llvm/IR/Intrinsics.h"
#include <optional>
-namespace llvm {
+using namespace llvm;
-std::optional<RoundingMode> convertStrToRoundingMode(StringRef RoundingArg) {
+std::optional<RoundingMode>
+llvm::convertStrToRoundingMode(StringRef RoundingArg) {
// For dynamic rounding mode, we use round to nearest but we will set the
// 'exact' SDNodeFlag so that the value will not be rounded.
return StringSwitch<std::optional<RoundingMode>>(RoundingArg)
@@ -34,7 +35,8 @@ std::optional<RoundingMode> convertStrToRoundingMode(StringRef RoundingArg) {
.Default(std::nullopt);
}
-std::optional<StringRef> convertRoundingModeToStr(RoundingMode UseRounding) {
+std::optional<StringRef>
+llvm::convertRoundingModeToStr(RoundingMode UseRounding) {
std::optional<StringRef> RoundingStr;
switch (UseRounding) {
case RoundingMode::Dynamic:
@@ -62,7 +64,7 @@ std::optional<StringRef> convertRoundingModeToStr(RoundingMode UseRounding) {
}
std::optional<fp::ExceptionBehavior>
-convertStrToExceptionBehavior(StringRef ExceptionArg) {
+llvm::convertStrToExceptionBehavior(StringRef ExceptionArg) {
return StringSwitch<std::optional<fp::ExceptionBehavior>>(ExceptionArg)
.Case("fpexcept.ignore", fp::ebIgnore)
.Case("fpexcept.maytrap", fp::ebMayTrap)
@@ -71,7 +73,7 @@ convertStrToExceptionBehavior(StringRef ExceptionArg) {
}
std::optional<StringRef>
-convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) {
+llvm::convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) {
std::optional<StringRef> ExceptStr;
switch (UseExcept) {
case fp::ebStrict:
@@ -87,7 +89,7 @@ convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) {
return ExceptStr;
}
-Intrinsic::ID getConstrainedIntrinsicID(const Instruction &Instr) {
+Intrinsic::ID llvm::getConstrainedIntrinsicID(const Instruction &Instr) {
Intrinsic::ID IID = Intrinsic::not_intrinsic;
switch (Instr.getOpcode()) {
case Instruction::FCmp:
@@ -127,5 +129,3 @@ Intrinsic::ID getConstrainedIntrinsicID(const Instruction &Instr) {
return IID;
}
-
-} // namespace llvm
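
This and the neighbouring hunks (AsmWriter, DebugLoc, DebugProgramInstruction, Operator, PassTimingInfo, PseudoProbe, ReplaceConstant, Use, User, Verifier, Record) apply the same mechanical change. A generic before/after sketch of the idiom, with a made-up function name:

  // Declared in a header inside namespace llvm:
  namespace llvm { int computeThing(int X); }

  // Before: the .cpp reopened the namespace around the definition:
  //   namespace llvm { int computeThing(int X) { return 2 * X; } }
  //
  // After: qualify the definition instead. A definition whose signature does
  // not match an existing declaration is now a compile error rather than a
  // silently introduced new symbol.
  using namespace llvm;
  int llvm::computeThing(int X) { return 2 * X; }
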
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 88dbd17..95edb2e 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -1019,8 +1019,7 @@ Value *IRBuilderBase::CreateSelectWithUnknownProfile(Value *C, Value *True,
const Twine &Name) {
Value *Ret = CreateSelectFMF(C, True, False, {}, Name);
if (auto *SI = dyn_cast<SelectInst>(Ret)) {
- setExplicitlyUnknownBranchWeightsIfProfiled(
- *SI, *SI->getParent()->getParent(), PassName);
+ setExplicitlyUnknownBranchWeightsIfProfiled(*SI, PassName);
}
return Ret;
}
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 3b8fde8..cd39970 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -4171,6 +4171,16 @@ SwitchInstProfUpdateWrapper::removeCase(SwitchInst::CaseIt I) {
return SI.removeCase(I);
}
+void SwitchInstProfUpdateWrapper::replaceDefaultDest(SwitchInst::CaseIt I) {
+ auto *DestBlock = I->getCaseSuccessor();
+ if (Weights) {
+ auto Weight = getSuccessorWeight(I->getCaseIndex() + 1);
+ (*Weights)[0] = Weight.value();
+ }
+
+ SI.setDefaultDest(DestBlock);
+}
+
void SwitchInstProfUpdateWrapper::addCase(
ConstantInt *OnVal, BasicBlock *Dest,
SwitchInstProfUpdateWrapper::CaseWeightOpt W) {
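
A hedged usage sketch of the new wrapper method: retarget the switch's default edge to an existing case's successor while the wrapper keeps the !prof branch weights consistent. SI is assumed to be a SwitchInst with profile metadata.

  SwitchInstProfUpdateWrapper SIW(*SI);
  SwitchInst::CaseIt Case = SI->case_begin(); // case whose block becomes the
  SIW.replaceDefaultDest(Case);               // new default destination
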
diff --git a/llvm/lib/IR/Operator.cpp b/llvm/lib/IR/Operator.cpp
index 39e5463c..c3e54a0 100644
--- a/llvm/lib/IR/Operator.cpp
+++ b/llvm/lib/IR/Operator.cpp
@@ -17,7 +17,8 @@
#include "ConstantsContext.h"
-namespace llvm {
+using namespace llvm;
+
bool Operator::hasPoisonGeneratingFlags() const {
switch (getOpcode()) {
case Instruction::Add:
@@ -288,4 +289,3 @@ void FastMathFlags::print(raw_ostream &O) const {
O << " afn";
}
}
-} // namespace llvm
diff --git a/llvm/lib/IR/PassTimingInfo.cpp b/llvm/lib/IR/PassTimingInfo.cpp
index 4e27086..cb1b91a 100644
--- a/llvm/lib/IR/PassTimingInfo.cpp
+++ b/llvm/lib/IR/PassTimingInfo.cpp
@@ -32,10 +32,10 @@ using namespace llvm;
#define DEBUG_TYPE "time-passes"
-namespace llvm {
+using namespace llvm;
-bool TimePassesIsEnabled = false;
-bool TimePassesPerRun = false;
+bool llvm::TimePassesIsEnabled = false;
+bool llvm::TimePassesPerRun = false;
static cl::opt<bool, true> EnableTiming(
"time-passes", cl::location(TimePassesIsEnabled), cl::Hidden,
@@ -139,7 +139,7 @@ PassTimingInfo *PassTimingInfo::TheTimeInfo;
} // namespace legacy
} // namespace
-Timer *getPassTimer(Pass *P) {
+Timer *llvm::getPassTimer(Pass *P) {
legacy::PassTimingInfo::init();
if (legacy::PassTimingInfo::TheTimeInfo)
return legacy::PassTimingInfo::TheTimeInfo->getPassTimer(P, P);
@@ -148,7 +148,7 @@ Timer *getPassTimer(Pass *P) {
/// If timing is enabled, report the times collected up to now and then reset
/// them.
-void reportAndResetTimings(raw_ostream *OutStream) {
+void llvm::reportAndResetTimings(raw_ostream *OutStream) {
if (legacy::PassTimingInfo::TheTimeInfo)
legacy::PassTimingInfo::TheTimeInfo->print(OutStream);
}
@@ -315,5 +315,3 @@ void TimePassesHandler::registerCallbacks(PassInstrumentationCallbacks &PIC) {
PIC.registerAfterAnalysisCallback(
[this](StringRef P, Any) { this->stopAnalysisTimer(P); });
}
-
-} // namespace llvm
diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp
index fc2be51..94dbe1f 100644
--- a/llvm/lib/IR/ProfDataUtils.cpp
+++ b/llvm/lib/IR/ProfDataUtils.cpp
@@ -274,9 +274,12 @@ void llvm::setExplicitlyUnknownBranchWeights(Instruction &I,
}
void llvm::setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I,
- Function &F,
- StringRef PassName) {
- if (std::optional<Function::ProfileCount> EC = F.getEntryCount();
+ StringRef PassName,
+ const Function *F) {
+ F = F ? F : I.getFunction();
+  assert(F && "Either pass an instruction attached to a Function, or explicitly "
+ "pass the Function that it will be attached to");
+ if (std::optional<Function::ProfileCount> EC = F->getEntryCount();
EC && EC->getCount() > 0)
setExplicitlyUnknownBranchWeights(I, PassName);
}
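
Call-site sketch for the updated helper: the Function is now an optional trailing parameter that defaults to the instruction's parent, which is what the IRBuilder hunk above exploits; it can still be passed explicitly, for example for an instruction that has not been inserted yet. SI, NewBr, F and the pass name are placeholders.

  // Instruction already attached to a function:
  setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE);
  // Instruction not yet inserted, so name the function explicitly:
  setExplicitlyUnknownBranchWeightsIfProfiled(*NewBr, DEBUG_TYPE, &F);
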
diff --git a/llvm/lib/IR/PseudoProbe.cpp b/llvm/lib/IR/PseudoProbe.cpp
index 59f218c..3c05f4b 100644
--- a/llvm/lib/IR/PseudoProbe.cpp
+++ b/llvm/lib/IR/PseudoProbe.cpp
@@ -19,9 +19,7 @@
using namespace llvm;
-namespace llvm {
-
-std::optional<PseudoProbe>
+static std::optional<PseudoProbe>
extractProbeFromDiscriminator(const DILocation *DIL) {
if (DIL) {
auto Discriminator = DIL->getDiscriminator();
@@ -43,7 +41,7 @@ extractProbeFromDiscriminator(const DILocation *DIL) {
return std::nullopt;
}
-std::optional<PseudoProbe>
+static std::optional<PseudoProbe>
extractProbeFromDiscriminator(const Instruction &Inst) {
assert(isa<CallBase>(&Inst) && !isa<IntrinsicInst>(&Inst) &&
"Only call instructions should have pseudo probe encodes as their "
@@ -53,7 +51,7 @@ extractProbeFromDiscriminator(const Instruction &Inst) {
return std::nullopt;
}
-std::optional<PseudoProbe> extractProbe(const Instruction &Inst) {
+std::optional<PseudoProbe> llvm::extractProbe(const Instruction &Inst) {
if (const auto *II = dyn_cast<PseudoProbeInst>(&Inst)) {
PseudoProbe Probe;
Probe.Id = II->getIndex()->getZExtValue();
@@ -73,7 +71,7 @@ std::optional<PseudoProbe> extractProbe(const Instruction &Inst) {
return std::nullopt;
}
-void setProbeDistributionFactor(Instruction &Inst, float Factor) {
+void llvm::setProbeDistributionFactor(Instruction &Inst, float Factor) {
assert(Factor >= 0 && Factor <= 1 &&
"Distribution factor must be in [0, 1.0]");
if (auto *II = dyn_cast<PseudoProbeInst>(&Inst)) {
@@ -111,5 +109,3 @@ void setProbeDistributionFactor(Instruction &Inst, float Factor) {
}
}
}
-
-} // namespace llvm
diff --git a/llvm/lib/IR/ReplaceConstant.cpp b/llvm/lib/IR/ReplaceConstant.cpp
index 962368f..b3586b4 100644
--- a/llvm/lib/IR/ReplaceConstant.cpp
+++ b/llvm/lib/IR/ReplaceConstant.cpp
@@ -16,7 +16,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
-namespace llvm {
+using namespace llvm;
static bool isExpandableUser(User *U) {
return isa<ConstantExpr>(U) || isa<ConstantAggregate>(U);
@@ -49,10 +49,10 @@ static SmallVector<Instruction *, 4> expandUser(BasicBlock::iterator InsertPt,
return NewInsts;
}
-bool convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts,
- Function *RestrictToFunc,
- bool RemoveDeadConstants,
- bool IncludeSelf) {
+bool llvm::convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts,
+ Function *RestrictToFunc,
+ bool RemoveDeadConstants,
+ bool IncludeSelf) {
// Find all expandable direct users of Consts.
SmallVector<Constant *> Stack;
for (Constant *C : Consts) {
@@ -121,5 +121,3 @@ bool convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts,
return Changed;
}
-
-} // namespace llvm
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 77af29b..f4c5c6f 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -7,7 +7,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/RuntimeLibcalls.h"
+#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/StringTable.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/xxhash.h"
#include "llvm/TargetParser/ARMTargetParser.h"
@@ -17,11 +19,17 @@
using namespace llvm;
using namespace RTLIB;
+#define GET_RUNTIME_LIBCALLS_INFO
#define GET_INIT_RUNTIME_LIBCALL_NAMES
#define GET_SET_TARGET_RUNTIME_LIBCALL_SETS
#define DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME
#include "llvm/IR/RuntimeLibcalls.inc"
+RuntimeLibcallsInfo::RuntimeLibcallsInfo(const Module &M)
+ : RuntimeLibcallsInfo(M.getTargetTriple()) {
+ // TODO: Consider module flags
+}
+
/// Set default libcall names. If a target wants to opt-out of a libcall it
/// should be placed here.
void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
@@ -72,3 +80,80 @@ bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) {
return false;
}
}
+
+std::pair<FunctionType *, AttributeList>
+RuntimeLibcallsInfo::getFunctionTy(LLVMContext &Ctx, const Triple &TT,
+ const DataLayout &DL,
+ RTLIB::LibcallImpl LibcallImpl) const {
+ static constexpr Attribute::AttrKind CommonFnAttrs[] = {
+ Attribute::NoCallback, Attribute::NoFree, Attribute::NoSync,
+ Attribute::NoUnwind, Attribute::WillReturn};
+
+ switch (LibcallImpl) {
+ case RTLIB::impl___sincos_stret:
+ case RTLIB::impl___sincosf_stret: {
+ if (!darwinHasSinCosStret(TT)) // Non-darwin currently unexpected
+ return {};
+
+ Type *ScalarTy = LibcallImpl == RTLIB::impl___sincosf_stret
+ ? Type::getFloatTy(Ctx)
+ : Type::getDoubleTy(Ctx);
+
+ AttrBuilder FuncAttrBuilder(Ctx);
+ for (Attribute::AttrKind Attr : CommonFnAttrs)
+ FuncAttrBuilder.addAttribute(Attr);
+
+ const bool UseSret =
+ TT.isX86_32() || ((TT.isARM() || TT.isThumb()) &&
+ ARM::computeTargetABI(TT) == ARM::ARM_ABI_APCS);
+
+ FuncAttrBuilder.addMemoryAttr(MemoryEffects::argumentOrErrnoMemOnly(
+ UseSret ? ModRefInfo::Mod : ModRefInfo::NoModRef, ModRefInfo::Mod));
+
+ AttributeList Attrs;
+ Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder);
+
+ if (UseSret) {
+ AttrBuilder AttrBuilder(Ctx);
+ StructType *StructTy = StructType::get(ScalarTy, ScalarTy);
+ AttrBuilder.addStructRetAttr(StructTy);
+ AttrBuilder.addAlignmentAttr(DL.getABITypeAlign(StructTy));
+ FunctionType *FuncTy = FunctionType::get(
+ Type::getVoidTy(Ctx), {DL.getAllocaPtrType(Ctx), ScalarTy}, false);
+
+ return {FuncTy, Attrs.addParamAttributes(Ctx, 0, AttrBuilder)};
+ }
+
+ Type *RetTy =
+ LibcallImpl == RTLIB::impl___sincosf_stret && TT.isX86_64()
+ ? static_cast<Type *>(FixedVectorType::get(ScalarTy, 2))
+ : static_cast<Type *>(StructType::get(ScalarTy, ScalarTy));
+
+ return {FunctionType::get(RetTy, {ScalarTy}, false), Attrs};
+ }
+ case RTLIB::impl_sqrtf:
+ case RTLIB::impl_sqrt: {
+ AttrBuilder FuncAttrBuilder(Ctx);
+
+ for (Attribute::AttrKind Attr : CommonFnAttrs)
+ FuncAttrBuilder.addAttribute(Attr);
+ FuncAttrBuilder.addMemoryAttr(MemoryEffects::errnoMemOnly(ModRefInfo::Mod));
+
+ AttributeList Attrs;
+ Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder);
+
+ Type *ScalarTy = LibcallImpl == RTLIB::impl_sqrtf ? Type::getFloatTy(Ctx)
+ : Type::getDoubleTy(Ctx);
+ FunctionType *FuncTy = FunctionType::get(ScalarTy, {ScalarTy}, false);
+
+ Attrs = Attrs.addRetAttribute(
+ Ctx, Attribute::getWithNoFPClass(Ctx, fcNegInf | fcNegSubnormal |
+ fcNegNormal));
+ return {FuncTy, Attrs};
+ }
+ default:
+ return {};
+ }
+
+ return {};
+}
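
A hedged sketch of turning the new getFunctionTy() hook into an actual declaration. M, Ctx and TT are assumed to be the module, its LLVMContext and the target triple; the returned FunctionType is null for implementations the hook does not describe.

  RTLIB::RuntimeLibcallsInfo Libcalls(M); // the new Module-based constructor
  auto [FnTy, Attrs] = Libcalls.getFunctionTy(Ctx, TT, M.getDataLayout(),
                                              RTLIB::impl___sincosf_stret);
  if (FnTy) {
    FunctionCallee Callee = M.getOrInsertFunction(
        Libcalls.getLibcallImplName(RTLIB::impl___sincosf_stret), FnTy, Attrs);
    (void)Callee; // declaration now carries the sret/memory attributes above
  }
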
diff --git a/llvm/lib/IR/Use.cpp b/llvm/lib/IR/Use.cpp
index 67882ba..5042335 100644
--- a/llvm/lib/IR/Use.cpp
+++ b/llvm/lib/IR/Use.cpp
@@ -9,7 +9,7 @@
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
-namespace llvm {
+using namespace llvm;
void Use::swap(Use &RHS) {
if (Val == RHS.Val)
@@ -42,5 +42,3 @@ void Use::zap(Use *Start, const Use *Stop, bool del) {
if (del)
::operator delete(Start);
}
-
-} // namespace llvm
diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp
index ab44cb4..9bb7c12 100644
--- a/llvm/lib/IR/User.cpp
+++ b/llvm/lib/IR/User.cpp
@@ -11,8 +11,11 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IntrinsicInst.h"
+using namespace llvm;
+
namespace llvm {
class BasicBlock;
+}
//===----------------------------------------------------------------------===//
// User Class
@@ -214,5 +217,3 @@ LLVM_NO_SANITIZE_MEMORY_ATTRIBUTE void User::operator delete(void *Usr) {
::operator delete(Storage);
}
}
-
-} // namespace llvm
diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index 95d61a9..b775cbb 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -148,10 +148,18 @@ void Value::destroyValueName() {
}
bool Value::hasNUses(unsigned N) const {
+ if (!UseList)
+ return N == 0;
+
+ // TODO: Disallow for ConstantData and remove !UseList check?
return hasNItems(use_begin(), use_end(), N);
}
bool Value::hasNUsesOrMore(unsigned N) const {
+ // TODO: Disallow for ConstantData and remove !UseList check?
+ if (!UseList)
+ return N == 0;
+
return hasNItemsOrMore(use_begin(), use_end(), N);
}
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 7917712..24f90bf 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -136,9 +136,7 @@ static cl::opt<bool> VerifyNoAliasScopeDomination(
cl::desc("Ensure that llvm.experimental.noalias.scope.decl for identical "
"scopes are not dominating"));
-namespace llvm {
-
-struct VerifierSupport {
+struct llvm::VerifierSupport {
raw_ostream *OS;
const Module &M;
ModuleSlotTracker MST;
@@ -318,8 +316,6 @@ public:
}
};
-} // namespace llvm
-
namespace {
class Verifier : public InstVisitor<Verifier>, VerifierSupport {
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 23be42f..fefc733 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1396,11 +1396,10 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) {
RTLIB::RuntimeLibcallsInfo Libcalls(TT);
SmallVector<const char *> LibcallSymbols;
- ArrayRef<RTLIB::LibcallImpl> LibcallImpls = Libcalls.getLibcallImpls();
- LibcallSymbols.reserve(LibcallImpls.size());
+ LibcallSymbols.reserve(Libcalls.getNumAvailableLibcallImpls());
- for (RTLIB::LibcallImpl Impl : LibcallImpls) {
- if (Impl != RTLIB::Unsupported)
+ for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) {
+ if (Libcalls.isAvailable(Impl))
LibcallSymbols.push_back(Libcalls.getLibcallImplName(Impl).data());
}
diff --git a/llvm/lib/MC/SPIRVObjectWriter.cpp b/llvm/lib/MC/SPIRVObjectWriter.cpp
index 5e37137..d693ea3 100644
--- a/llvm/lib/MC/SPIRVObjectWriter.cpp
+++ b/llvm/lib/MC/SPIRVObjectWriter.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSPIRVObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCValue.h"
@@ -17,8 +18,10 @@ using namespace llvm;
void SPIRVObjectWriter::writeHeader(const MCAssembler &Asm) {
constexpr uint32_t MagicNumber = 0x07230203;
constexpr uint32_t GeneratorID = 43;
- constexpr uint32_t GeneratorMagicNumber =
- (GeneratorID << 16) | (LLVM_VERSION_MAJOR);
+ const uint32_t GeneratorMagicNumber =
+ Asm.getContext().getTargetTriple().getVendor() == Triple::AMD
+ ? UINT16_MAX
+ : ((GeneratorID << 16) | (LLVM_VERSION_MAJOR));
constexpr uint32_t Schema = 0;
W.write<uint32_t>(MagicNumber);
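
For reference, the generator word written here packs the registered tool ID in its upper 16 bits and a tool-chosen version in the lower 16 bits. A tiny compile-time check of the default (non-AMD) encoding, with the LLVM major version stood in by a constant:

  constexpr uint32_t GeneratorID = 43;  // LLVM's registered tool ID
  constexpr uint32_t ToolVersion = 22;  // stand-in for LLVM_VERSION_MAJOR
  constexpr uint32_t Word = (GeneratorID << 16) | ToolVersion;
  static_assert((Word >> 16) == GeneratorID && (Word & 0xFFFFu) == ToolVersion,
                "tool ID lives in the high half of the generator word");
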
diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp
index e09dc94..c2f4560 100644
--- a/llvm/lib/Object/MachOObjectFile.cpp
+++ b/llvm/lib/Object/MachOObjectFile.cpp
@@ -1978,20 +1978,42 @@ uint64_t MachOObjectFile::getSectionSize(DataRefImpl Sec) const {
return SectSize;
}
-ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint32_t Offset,
+ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint64_t Offset,
uint64_t Size) const {
return arrayRefFromStringRef(getData().substr(Offset, Size));
}
Expected<ArrayRef<uint8_t>>
MachOObjectFile::getSectionContents(DataRefImpl Sec) const {
- uint32_t Offset;
+ uint64_t Offset;
uint64_t Size;
if (is64Bit()) {
MachO::section_64 Sect = getSection64(Sec);
Offset = Sect.offset;
Size = Sect.size;
+    // Check for large Mach-O files where the section contents might exceed
+    // 4GB. MachO::section_64 objects only have 32-bit file offsets to the
+    // section contents, which can overflow in dSYM files. Track this and
+    // adjust the section offset to be 64-bit safe. If any section's contents
+    // cross the 4GB boundary, the sections must be ordered by file offset;
+    // otherwise an error is returned so invalid section data is never handed
+    // back.
+ uint64_t PrevTrueOffset = 0;
+ uint64_t SectOffsetAdjust = 0;
+ for (uint32_t SectIdx = 0; SectIdx < Sec.d.a; ++SectIdx) {
+ MachO::section_64 CurrSect =
+ getStruct<MachO::section_64>(*this, Sections[SectIdx]);
+ uint64_t CurrTrueOffset = (uint64_t)CurrSect.offset + SectOffsetAdjust;
+ if ((SectOffsetAdjust > 0) && (PrevTrueOffset > CurrTrueOffset))
+ return malformedError("section data exceeds 4GB and section file "
+ "offsets are not ordered");
+ const uint64_t EndSectFileOffset =
+ (uint64_t)CurrSect.offset + CurrSect.size;
+ if (EndSectFileOffset > UINT32_MAX)
+ SectOffsetAdjust += EndSectFileOffset & 0xFFFFFFFF00000000ull;
+ PrevTrueOffset = CurrTrueOffset;
+ }
+ Offset += SectOffsetAdjust;
} else {
MachO::section Sect = getSection(Sec);
Offset = Sect.offset;
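
A worked example (illustrative numbers) of the adjustment the loop above computes for a dSYM whose section contents cross the 4GB mark:

  // A 6 GiB section starts at file offset 0x1000; its end passes 4 GiB.
  uint64_t PrevOffset = 0x1000;
  uint64_t PrevSize   = 6ull << 30;                        // 6 GiB
  uint64_t EndOffset  = PrevOffset + PrevSize;             // 0x180001000
  uint64_t Adjust     = EndOffset & 0xFFFFFFFF00000000ull; // 0x100000000
  // The next section really starts at 0x180001000, but section_64 can only
  // store the low 32 bits of that offset:
  uint32_t Stored     = 0x80001000;
  uint64_t TrueOffset = Stored + Adjust;                   // 0x180001000 again
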
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index e5e5fc2..29f2916 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -37,8 +37,6 @@ unsigned Object::getMachine() const {
return *Header.Machine;
return llvm::ELF::EM_NONE;
}
-
-constexpr StringRef SectionHeaderTable::TypeStr;
} // namespace ELFYAML
namespace yaml {
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 3c9a27a..40ceb6f 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -67,6 +67,7 @@
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RuntimeLibcallInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionDivision.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index bd03ac0..3f41618 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -228,7 +228,7 @@ static cl::opt<bool> EnableLoopHeaderDuplication(
static cl::opt<bool>
EnableDFAJumpThreading("enable-dfa-jump-thread",
cl::desc("Enable DFA jump threading"),
- cl::init(false), cl::Hidden);
+ cl::init(true), cl::Hidden);
static cl::opt<bool>
EnableHotColdSplit("hot-cold-split",
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 1853cdd..d870f99 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -35,6 +35,7 @@ MODULE_ANALYSIS("no-op-module", NoOpModuleAnalysis())
MODULE_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC))
MODULE_ANALYSIS("profile-summary", ProfileSummaryAnalysis())
MODULE_ANALYSIS("reg-usage", PhysicalRegisterUsageAnalysis())
+MODULE_ANALYSIS("runtime-libcall-info", RuntimeLibraryAnalysis())
MODULE_ANALYSIS("stack-safety", StackSafetyGlobalAnalysis())
MODULE_ANALYSIS("verify", VerifierAnalysis())
diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index 708e79d..6c140be 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -34,6 +34,31 @@ static bool isContextFalsey(const json::Value *V) {
return isFalsey(*V);
}
+static void splitAndTrim(StringRef Str, SmallVectorImpl<StringRef> &Tokens) {
+ size_t CurrentPos = 0;
+ while (CurrentPos < Str.size()) {
+ // Find the next delimiter.
+ size_t DelimiterPos = Str.find('.', CurrentPos);
+
+ // If no delimiter is found, process the rest of the string.
+ if (DelimiterPos == StringRef::npos)
+ DelimiterPos = Str.size();
+
+ // Get the current part, which may have whitespace.
+ StringRef Part = Str.slice(CurrentPos, DelimiterPos);
+
+ // Manually trim the part without creating a new string object.
+ size_t Start = Part.find_first_not_of(" \t\r\n");
+ if (Start != StringRef::npos) {
+ size_t End = Part.find_last_not_of(" \t\r\n");
+ Tokens.push_back(Part.slice(Start, End + 1));
+ }
+
+ // Move past the delimiter for the next iteration.
+ CurrentPos = DelimiterPos + 1;
+ }
+}
+
static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) {
// We split the mustache string into an accessor.
// For example:
@@ -46,13 +71,7 @@ static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) {
// It's a literal, so it doesn't need to be saved.
Tokens.push_back(".");
} else {
- while (!Str.empty()) {
- StringRef Part;
- std::tie(Part, Str) = Str.split('.');
- // Each part of the accessor needs to be saved to the arena
- // to ensure it has a stable address.
- Tokens.push_back(Ctx.Saver.save(Part.trim()));
- }
+ splitAndTrim(Str, Tokens);
}
// Now, allocate memory for the array of StringRefs in the arena.
StringRef *ArenaTokens = Ctx.Allocator.Allocate<StringRef>(Tokens.size());
@@ -368,141 +387,99 @@ struct Tag {
llvm_unreachable("Unknown json::Value::Kind");
}
-static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open,
- StringRef Close) {
- const StringLiteral TripleOpen("{{{");
- const StringLiteral TripleClose("}}}");
-
- size_t NormalOpenPos = Template.find(Open, StartPos);
- size_t TripleOpenPos = Template.find(TripleOpen, StartPos);
-
- Tag Result;
-
- // Determine which tag comes first.
- if (TripleOpenPos != StringRef::npos &&
- (NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) {
- // Found a triple mustache tag.
- size_t EndPos =
- Template.find(TripleClose, TripleOpenPos + TripleOpen.size());
- if (EndPos == StringRef::npos)
- return Result; // No closing tag found.
-
- Result.TagKind = Tag::Kind::Triple;
- Result.StartPosition = TripleOpenPos;
- size_t ContentStart = TripleOpenPos + TripleOpen.size();
- Result.Content = Template.substr(ContentStart, EndPos - ContentStart);
- Result.FullMatch = Template.substr(
- TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos);
- } else if (NormalOpenPos != StringRef::npos) {
- // Found a normal mustache tag.
- size_t EndPos = Template.find(Close, NormalOpenPos + Open.size());
- if (EndPos == StringRef::npos)
- return Result; // No closing tag found.
-
- Result.TagKind = Tag::Kind::Normal;
- Result.StartPosition = NormalOpenPos;
- size_t ContentStart = NormalOpenPos + Open.size();
- Result.Content = Template.substr(ContentStart, EndPos - ContentStart);
- Result.FullMatch =
- Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos);
- }
-
- return Result;
-}
-
-static std::optional<std::pair<StringRef, StringRef>>
-processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) {
- LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content
- << ", Kind: " << tagKindToString(T.TagKind) << "\n");
- if (T.TagKind == Tag::Kind::Triple) {
- Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx);
- return std::nullopt;
- }
- StringRef Interpolated = T.Content;
- if (!Interpolated.trim().starts_with("=")) {
- char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
- Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx);
- return std::nullopt;
- }
- Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx);
- StringRef DelimSpec = Interpolated.trim();
- DelimSpec = DelimSpec.drop_front(1);
- DelimSpec = DelimSpec.take_until([](char C) { return C == '='; });
- DelimSpec = DelimSpec.trim();
-
- std::pair<StringRef, StringRef> Ret = DelimSpec.split(' ');
- LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << Ret.first
- << ", NewClose: " << Ret.second << "\n");
- return Ret;
-}
-
// Simple tokenizer that splits the template into tokens.
-// The mustache spec allows {{{ }}} to unescape variables,
-// but we don't support that here. An unescape variable
-// is represented only by {{& variable}}.
static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) {
LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n");
SmallVector<Token> Tokens;
SmallString<8> Open("{{");
SmallString<8> Close("}}");
- size_t Start = 0;
+ size_t Cursor = 0;
+ size_t TextStart = 0;
- while (Start < Template.size()) {
- LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start:" << Start << ", Open:'" << Open
- << "', Close:'" << Close << "'\n");
- Tag T = findNextTag(Template, Start, Open, Close);
+ const StringLiteral TripleOpen("{{{");
+ const StringLiteral TripleClose("}}}");
- if (T.TagKind == Tag::Kind::None) {
- // No more tags, the rest is text.
- Tokens.emplace_back(Template.substr(Start));
- break;
+ while (Cursor < Template.size()) {
+ StringRef TemplateSuffix = Template.substr(Cursor);
+ StringRef TagOpen, TagClose;
+ Tag::Kind Kind;
+
+ // Determine which tag we've encountered.
+ if (TemplateSuffix.starts_with(TripleOpen)) {
+ Kind = Tag::Kind::Triple;
+ TagOpen = TripleOpen;
+ TagClose = TripleClose;
+ } else if (TemplateSuffix.starts_with(Open)) {
+ Kind = Tag::Kind::Normal;
+ TagOpen = Open;
+ TagClose = Close;
+ } else {
+ // Not at a tag, continue scanning.
+ ++Cursor;
+ continue;
}
- // Add the text before the tag.
- if (T.StartPosition > Start) {
- StringRef Text = Template.substr(Start, T.StartPosition - Start);
- Tokens.emplace_back(Text);
+ // Found a tag, first add the preceding text.
+ if (Cursor > TextStart)
+ Tokens.emplace_back(Template.slice(TextStart, Cursor));
+
+ // Find the closing tag.
+ size_t EndPos = Template.find(TagClose, Cursor + TagOpen.size());
+ if (EndPos == StringRef::npos) {
+ // No closing tag, the rest is text.
+ Tokens.emplace_back(Template.substr(Cursor));
+ TextStart = Cursor = Template.size();
+ break;
}
- if (auto NewDelims = processTag(T, Tokens, Ctx)) {
- std::tie(Open, Close) = *NewDelims;
+ // Extract tag content and full match.
+ size_t ContentStart = Cursor + TagOpen.size();
+ StringRef Content = Template.substr(ContentStart, EndPos - ContentStart);
+ StringRef FullMatch =
+ Template.substr(Cursor, (EndPos + TagClose.size()) - Cursor);
+
+ // Process the tag (inlined logic from processTag).
+ LLVM_DEBUG(dbgs() << "[Tag] " << FullMatch << ", Content: " << Content
+ << ", Kind: " << tagKindToString(Kind) << "\n");
+ if (Kind == Tag::Kind::Triple) {
+ Tokens.emplace_back(FullMatch, Ctx.Saver.save("&" + Content), '&', Ctx);
+ } else { // Normal Tag
+ StringRef Interpolated = Content;
+ if (!Interpolated.trim().starts_with("=")) {
+ char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
+ Tokens.emplace_back(FullMatch, Interpolated, Front, Ctx);
+ } else { // Set Delimiter
+ Tokens.emplace_back(FullMatch, Interpolated, '=', Ctx);
+ StringRef DelimSpec = Interpolated.trim();
+ DelimSpec = DelimSpec.drop_front(1);
+ DelimSpec = DelimSpec.take_until([](char C) { return C == '='; });
+ DelimSpec = DelimSpec.trim();
+
+ auto [NewOpen, NewClose] = DelimSpec.split(' ');
+ LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen
+ << ", NewClose: " << NewClose << "\n");
+ Open = NewOpen;
+ Close = NewClose;
+ }
}
- // Move past the tag.
- Start = T.StartPosition + T.FullMatch.size();
+ // Move past the tag for the next iteration.
+ Cursor += FullMatch.size();
+ TextStart = Cursor;
}
- // Fix up white spaces for:
- // - open sections
- // - inverted sections
- // - close sections
- // - comments
- //
- // This loop attempts to find standalone tokens and tries to trim out
- // the surrounding whitespace.
- // For example:
- // if you have the template string
- // {{#section}} \n Example \n{{/section}}
- // The output should would be
- // For example:
- // \n Example \n
+ // Add any remaining text after the last tag.
+ if (TextStart < Template.size())
+ Tokens.emplace_back(Template.substr(TextStart));
+
+ // Fix up white spaces for standalone tags.
size_t LastIdx = Tokens.size() - 1;
for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) {
Token &CurrentToken = Tokens[Idx];
Token::Type CurrentType = CurrentToken.getType();
- // Check if token type requires cleanup.
- bool RequiresCleanUp = requiresCleanUp(CurrentType);
-
- if (!RequiresCleanUp)
+ if (!requiresCleanUp(CurrentType))
continue;
- // We adjust the token body if there's no text behind or ahead.
- // A token is considered to have no text ahead if the right of the previous
- // token is a newline followed by spaces.
- // A token is considered to have no text behind if the left of the next
- // token is spaces followed by a newline.
- // eg.
- // "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3"
bool HasTextBehind = hasTextBehind(Idx, Tokens);
bool HasTextAhead = hasTextAhead(Idx, Tokens);
@@ -622,9 +599,16 @@ void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty,
size_t Start = CurrentPtr;
parseMustache(CurrentNode);
const size_t End = CurrentPtr - 1;
+
+ size_t RawBodySize = 0;
+ for (size_t I = Start; I < End; ++I)
+ RawBodySize += Tokens[I].RawBody.size();
+
SmallString<128> RawBody;
- for (std::size_t I = Start; I < End; I++)
+ RawBody.reserve(RawBodySize);
+ for (std::size_t I = Start; I < End; ++I)
RawBody += Tokens[I].RawBody;
+
CurrentNode->setRawBody(Ctx.Saver.save(StringRef(RawBody)));
Parent->addChild(CurrentNode);
}
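
A self-contained sketch of the accessor splitting the new splitAndTrim performs: parts are split on '.', trimmed of surrounding whitespace, and whitespace-only or empty parts are dropped instead of being saved as empty tokens. The helper name is illustrative.

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/StringRef.h"
  using namespace llvm;

  static void splitDottedAccessor(StringRef Str, SmallVectorImpl<StringRef> &Out) {
    while (!Str.empty()) {
      auto [Part, Rest] = Str.split('.');
      Part = Part.trim(" \t\r\n");
      if (!Part.empty())       // "  a . .b " -> {"a", "b"}
        Out.push_back(Part);
      Str = Rest;
    }
  }
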
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index afce803..8ad20b4 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -46,12 +46,11 @@ using namespace llvm;
// Context
//===----------------------------------------------------------------------===//
-namespace llvm::detail {
/// This class represents the internal implementation of the RecordKeeper.
/// It contains all of the contextual static state of the Record classes. It is
/// kept out-of-line to simplify dependencies, and also make it easier for
/// internal classes to access the uniquer state of the keeper.
-struct RecordKeeperImpl {
+struct detail::RecordKeeperImpl {
RecordKeeperImpl(RecordKeeper &RK)
: SharedBitRecTy(RK), SharedIntRecTy(RK), SharedStringRecTy(RK),
SharedDagRecTy(RK), AnyRecord(RK, {}), TheUnsetInit(RK),
@@ -99,7 +98,6 @@ struct RecordKeeperImpl {
void dumpAllocationStats(raw_ostream &OS) const;
};
-} // namespace llvm::detail
void detail::RecordKeeperImpl::dumpAllocationStats(raw_ostream &OS) const {
// Dump memory allocation related stats.
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 1b5a713..34c85d5 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -601,6 +601,12 @@ def CSR_Win_AArch64_AAPCS_SwiftError
def CSR_Win_AArch64_AAPCS_SwiftTail
: CalleeSavedRegs<(sub CSR_Win_AArch64_AAPCS, X20, X22)>;
+def CSR_Win_AArch64_RT_MostRegs
+ : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, (sequence "X%u", 9, 15))>;
+
+def CSR_Win_AArch64_RT_AllRegs
+ : CalleeSavedRegs<(add CSR_Win_AArch64_RT_MostRegs, (sequence "Q%u", 8, 31))>;
+
// The Control Flow Guard check call uses a custom calling convention that also
// preserves X0-X8 and Q0-Q7.
def CSR_Win_AArch64_CFGuard_Check : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS,
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index cf34498..18e246e 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -81,10 +81,7 @@ namespace {
class AArch64FastISel final : public FastISel {
class Address {
public:
- using BaseKind = enum {
- RegBase,
- FrameIndexBase
- };
+ enum BaseKind { RegBase, FrameIndexBase };
private:
BaseKind Kind = RegBase;
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 0f7b34c..3ee4d58 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2380,13 +2380,6 @@ void AArch64FrameLowering::determineStackHazardSlot(
return;
}
- const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- if (MFI.hasVarSizedObjects() || TRI->hasStackRealignment(MF)) {
- LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with variable "
- "sized objects or realignment\n");
- return;
- }
-
// If another calling convention is explicitly set FPRs can't be promoted to
// ZPR callee-saves.
if (!is_contained({CallingConv::C, CallingConv::Fast,
@@ -2402,6 +2395,7 @@ void AArch64FrameLowering::determineStackHazardSlot(
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Expected SVE to be available for PPRs");
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
// With SplitSVEObjects the CS hazard padding is placed between the
// PPRs and ZPRs. If there are any FPR CS there would be a hazard between
// them and the CS GRPs. Avoid this by promoting all FPR CS to ZPRs.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 60aa61e..40e6400 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -50,6 +50,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
@@ -104,7 +105,6 @@
#include <vector>
using namespace llvm;
-using namespace llvm::PatternMatch;
#define DEBUG_TYPE "aarch64-lower"
@@ -1052,15 +1052,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
- if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
- getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
- // Issue __sincos_stret if available.
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
- setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
- } else {
- setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
- }
+ // FSINCOS is expanded; __sincos_stret (when available) is handled by the generic expansion.
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
// Make floating-point constants legal for the large code model, so they don't
// become loads from the constant pool.
@@ -1180,6 +1174,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE);
+ setTargetDAGCombine(ISD::CTPOP);
// In case of strict alignment, avoid an excessive number of byte wide stores.
MaxStoresPerMemsetOptSize = 8;
@@ -5346,35 +5341,6 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
return SDValue();
}
-SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
- SelectionDAG &DAG) const {
- // For iOS, we want to call an alternative entry point: __sincos_stret,
- // which returns the values in two S / D registers.
- SDLoc DL(Op);
- SDValue Arg = Op.getOperand(0);
- EVT ArgVT = Arg.getValueType();
- Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
-
- ArgListTy Args;
- Args.emplace_back(Arg, ArgTy);
-
- RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
- : RTLIB::SINCOS_STRET_F32;
- const char *LibcallName = getLibcallName(LC);
- SDValue Callee =
- DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
-
- StructType *RetTy = StructType::get(ArgTy, ArgTy);
- TargetLowering::CallLoweringInfo CLI(DAG);
- CallingConv::ID CC = getLibcallCallingConv(LC);
- CLI.setDebugLoc(DL)
- .setChain(DAG.getEntryNode())
- .setLibCallee(CC, RetTy, Callee, std::move(Args));
-
- std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
- return CallResult.first;
-}
-
static MVT getSVEContainerType(EVT ContentTy);
SDValue
@@ -7723,8 +7689,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::FP_TO_SINT_SAT:
case ISD::FP_TO_UINT_SAT:
return LowerFP_TO_INT_SAT(Op, DAG);
- case ISD::FSINCOS:
- return LowerFSINCOS(Op, DAG);
case ISD::GET_ROUNDING:
return LowerGET_ROUNDING(Op, DAG);
case ISD::SET_ROUNDING:
@@ -11367,9 +11331,10 @@ SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
break;
}
+ // Note: This lowering only overrides NEON for v1i64 and v2i64, where we
+ // prefer using SVE if available.
if (VT.isScalableVector() ||
- useSVEForFixedLengthVectorVT(
- VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
+ useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) {
switch (Opcode) {
default:
llvm_unreachable("Wrong instruction");
@@ -17591,6 +17556,7 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
// udot instruction.
if (SrcWidth * 4 <= DstWidth) {
if (all_of(I->users(), [&](auto *U) {
+ using namespace llvm::PatternMatch;
auto *SingleUser = cast<Instruction>(&*U);
if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
return true;
@@ -17862,6 +17828,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
// into shift / and masks. For the moment we do this just for uitofp (not
// zext) to avoid issues with widening instructions.
if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
+ using namespace llvm::PatternMatch;
return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
SI->getType()->getScalarSizeInBits() * 4 ==
SI->user_back()->getType()->getScalarSizeInBits();
@@ -27878,6 +27845,35 @@ static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
{A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
}
+static SDValue performCTPOPCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ using namespace llvm::SDPatternMatch;
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ // ctpop(zext(bitcast(vector_mask))) -> neg(signed_reduce_add(vector_mask))
+ SDValue Mask;
+ if (!sd_match(N->getOperand(0), m_ZExt(m_BitCast(m_Value(Mask)))))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ EVT MaskVT = Mask.getValueType();
+
+ if (VT.isVector() || !MaskVT.isFixedLengthVector() ||
+ MaskVT.getVectorElementType() != MVT::i1)
+ return SDValue();
+
+ EVT ReduceInVT =
+ EVT::getVectorVT(*DAG.getContext(), VT, MaskVT.getVectorElementCount());
+
+ SDLoc DL(N);
+ // Sign extend to best fit ZeroOrNegativeOneBooleanContent.
+ SDValue ExtMask = DAG.getNode(ISD::SIGN_EXTEND, DL, ReduceInVT, Mask);
+ SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, ExtMask);
+ return DAG.getNegative(NegPopCount, DL, VT);
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -28223,6 +28219,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performScalarToVectorCombine(N, DCI, DAG);
case ISD::SHL:
return performSHLCombine(N, DCI, DAG);
+ case ISD::CTPOP:
+ return performCTPOPCombine(N, DCI, DAG);
}
return SDValue();
}
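
The new combine relies on the fact that sign-extending an i1 lane yields 0 or -1, so summing the lanes of the extended mask gives the negated population count. A small scalar sketch of that identity (not the DAG code itself):

#include <cassert>
#include <cstdint>

// For a boolean mask, sign-extending each lane to 0 / -1 and summing gives
// the negated population count, so popcount == -sum.
int popcountViaSignedReduceAdd(const bool Mask[], int N) {
  int64_t Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += Mask[I] ? -1 : 0; // sign-extended i1 lane
  return static_cast<int>(-Sum);
}

int main() {
  bool Mask[] = {true, false, true, true};
  assert(popcountViaSignedReduceAdd(Mask, 4) == 3);
}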
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 2cb8ed2..70bfae7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -745,7 +745,6 @@ private:
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 30b7b03..52b216c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -197,6 +197,12 @@ def G_SMULL : AArch64GenericInstruction {
let hasSideEffects = 0;
}
+def G_PMULL : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src1, type1:$src2);
+ let hasSideEffects = 0;
+}
+
def G_UADDLP : AArch64GenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src1);
@@ -273,6 +279,7 @@ def : GINodeEquiv<G_FCMGT, AArch64fcmgt>;
def : GINodeEquiv<G_BSP, AArch64bsp>;
+def : GINodeEquiv<G_PMULL, AArch64pmull>;
def : GINodeEquiv<G_UMULL, AArch64umull>;
def : GINodeEquiv<G_SMULL, AArch64smull>;
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index e69fa32..2ab7bf1 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -1386,6 +1386,25 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
if (MOP.isReg() && MOP.isKill())
DefinedInBB.addReg(MOP.getReg());
+ // Copy over any implicit-def operands. This is like MI.copyImplicitOps, but
+ // only copies implicit defs and makes sure that each operand is only added
+ // once in case of duplicates.
+ auto CopyImplicitOps = [&](MachineBasicBlock::iterator MI1,
+ MachineBasicBlock::iterator MI2) {
+ SmallSetVector<Register, 4> Ops;
+ for (const MachineOperand &MO :
+ llvm::drop_begin(MI1->operands(), MI1->getDesc().getNumOperands()))
+ if (MO.isReg() && MO.isImplicit() && MO.isDef())
+ Ops.insert(MO.getReg());
+ for (const MachineOperand &MO :
+ llvm::drop_begin(MI2->operands(), MI2->getDesc().getNumOperands()))
+ if (MO.isReg() && MO.isImplicit() && MO.isDef())
+ Ops.insert(MO.getReg());
+ for (auto Op : Ops)
+ MIB.addDef(Op, RegState::Implicit);
+ };
+ CopyImplicitOps(I, Paired);
+
// Erase the old instructions.
I->eraseFromParent();
Paired->eraseFromParent();
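
The lambda above deduplicates implicit defs from both paired instructions before appending them to the merged instruction. A standalone sketch of the same keep-first-occurrence pattern that SmallSetVector provides, using plain standard containers and illustrative types:

#include <set>
#include <vector>

// Merge two operand lists, keeping first-seen order and dropping duplicates.
std::vector<unsigned> mergeImplicitDefs(const std::vector<unsigned> &A,
                                        const std::vector<unsigned> &B) {
  std::vector<unsigned> Ordered;
  std::set<unsigned> Seen;
  for (unsigned Reg : A)
    if (Seen.insert(Reg).second)
      Ordered.push_back(Reg);
  for (unsigned Reg : B)
    if (Seen.insert(Reg).second)
      Ordered.push_back(Reg);
  return Ordered;
}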
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
index 45b7120..4df4d54 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -805,7 +805,7 @@ void AArch64PrologueEmitter::emitPrologue() {
CFAOffset += SVEAllocs.BeforePPRs;
assert(PPRRange.End == ZPRRange.Begin &&
"Expected ZPR callee saves after PPR locals");
- allocateStackSpace(PPRRange.End, RealignmentPadding, SVEAllocs.AfterPPRs,
+ allocateStackSpace(PPRRange.End, 0, SVEAllocs.AfterPPRs,
EmitAsyncCFI && !HasFP, CFAOffset,
MFI.hasVarSizedObjects() || SVEAllocs.AfterZPRs);
CFAOffset += SVEAllocs.AfterPPRs;
@@ -1318,6 +1318,26 @@ AArch64EpilogueEmitter::AArch64EpilogueEmitter(MachineFunction &MF,
SEHEpilogueStartI = MBB.end();
}
+void AArch64EpilogueEmitter::moveSPBelowFP(MachineBasicBlock::iterator MBBI,
+ StackOffset Offset) {
+ // Other combinations could be supported, but are not currently needed.
+ assert(Offset.getScalable() < 0 && Offset.getFixed() <= 0 &&
+ "expected negative offset (with optional fixed portion)");
+ Register Base = AArch64::FP;
+ if (int64_t FixedOffset = Offset.getFixed()) {
+ // If we have a negative fixed offset, we need to subtract it into a
+ // temporary register first (to avoid briefly deallocating the scalable
+ // portion of the offset).
+ Base = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ emitFrameOffset(MBB, MBBI, DL, Base, AArch64::FP,
+ StackOffset::getFixed(FixedOffset), TII,
+ MachineInstr::FrameDestroy);
+ }
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, Base,
+ StackOffset::getScalable(Offset.getScalable()), TII,
+ MachineInstr::FrameDestroy);
+}
+
void AArch64EpilogueEmitter::emitEpilogue() {
MachineBasicBlock::iterator EpilogueEndI = MBB.getLastNonDebugInstr();
if (MBB.end() != EpilogueEndI) {
@@ -1418,6 +1438,7 @@ void AArch64EpilogueEmitter::emitEpilogue() {
AfterCSRPopSize += ProloguePopSize;
}
}
+
// Move past the restores of the callee-saved registers.
// If we plan on combining the sp bump of the local stack size and the callee
// save stack size, we might need to adjust the CSR save and restore offsets.
@@ -1483,7 +1504,6 @@ void AArch64EpilogueEmitter::emitEpilogue() {
StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR});
- MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin;
// Deallocate the SVE area.
if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
@@ -1510,28 +1530,25 @@ void AArch64EpilogueEmitter::emitEpilogue() {
(AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP
: AArch64::SP;
if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) {
- // TODO: Support stack realigment and variable-sized objects.
- assert(
- SVELayout != SVEStackLayout::Split &&
- "unexpected stack realignment or variable sized objects with split "
- "SVE stack objects");
-
- Register CalleeSaveBase = AArch64::FP;
- if (int64_t CalleeSaveBaseOffset =
- AFI->getCalleeSaveBaseToFrameRecordOffset()) {
- // If we have have an non-zero offset to the non-SVE CS base we need to
- // compute the base address by subtracting the offest in a temporary
- // register first (to avoid briefly deallocating the SVE CS).
- CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister(
- &AArch64::GPR64RegClass);
- emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP,
- StackOffset::getFixed(-CalleeSaveBaseOffset), TII,
- MachineInstr::FrameDestroy);
+ if (ZPR.CalleeSavesSize || SVELayout != SVEStackLayout::Split) {
+ // The offset from the frame-pointer to the start of the ZPR saves.
+ StackOffset FPOffsetZPR =
+ -SVECalleeSavesSize - PPR.LocalsSize -
+ StackOffset::getFixed(AFI->getCalleeSaveBaseToFrameRecordOffset());
+ // Deallocate the stack space by moving the SP to the start of the
+ // ZPR/PPR callee-save area.
+ moveSPBelowFP(ZPRRange.Begin, FPOffsetZPR);
+ }
+ // With split SVE, the predicates are stored in a separate area above the
+ // ZPR saves, so we must adjust the stack to the start of the PPRs.
+ if (PPR.CalleeSavesSize && SVELayout == SVEStackLayout::Split) {
+ // The offset from the frame-pointer to the start of the PPR saves.
+ StackOffset FPOffsetPPR = -PPR.CalleeSavesSize;
+ // Move to the start of the PPR area.
+ assert(!FPOffsetPPR.getFixed() && "expected only scalable offset");
+ emitFrameOffset(MBB, ZPRRange.End, DL, AArch64::SP, AArch64::FP,
+ FPOffsetPPR, TII, MachineInstr::FrameDestroy);
}
- // The code below will deallocate the stack space space by moving the SP
- // to the start of the SVE callee-save area.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase,
- -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy);
} else if (BaseForSVEDealloc == AArch64::SP) {
auto NonSVELocals = StackOffset::getFixed(NumBytes);
auto CFAOffset = NonSVELocals + StackOffset::getFixed(PrologueSaveSize) +
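
moveSPBelowFP applies the fixed part of the offset to a scratch base first and only then writes SP, so SP never transiently sits above the region being preserved. A rough integer model of that ordering (purely illustrative, not the emitted code):

struct Offset { long Fixed; long Scalable; };

// Fold the fixed part into a scratch base first; the single SP update then
// lands directly on the final address and never deallocates the scalable
// region between FP and FP + Offset.
long moveSPBelowFPModel(long FP, Offset Off, long BytesPerScalableUnit) {
  long Base = FP;
  if (Off.Fixed != 0)
    Base = FP + Off.Fixed;                            // scratch register, SP untouched
  return Base + Off.Scalable * BytesPerScalableUnit;  // one SP write
}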
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
index 6e0e283..7f297b5 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
@@ -180,6 +180,10 @@ public:
private:
bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const;
+ /// A helper for moving the SP to a negative offset from the FP, without
+ /// deallocating any stack in the range FP to FP + Offset.
+ void moveSPBelowFP(MachineBasicBlock::iterator MBBI, StackOffset Offset);
+
void emitSwiftAsyncContextFramePointer(MachineBasicBlock::iterator MBBI,
const DebugLoc &DL) const;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 5bfb19d9..a5048b9 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -90,6 +90,16 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin())
return getDarwinCalleeSavedRegs(MF);
+ if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
+ return MF->getSubtarget<AArch64Subtarget>().isTargetWindows()
+ ? CSR_Win_AArch64_RT_MostRegs_SaveList
+ : CSR_AArch64_RT_MostRegs_SaveList;
+
+ if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll)
+ return MF->getSubtarget<AArch64Subtarget>().isTargetWindows()
+ ? CSR_Win_AArch64_RT_AllRegs_SaveList
+ : CSR_AArch64_RT_AllRegs_SaveList;
+
if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check)
return CSR_Win_AArch64_CFGuard_Check_SaveList;
if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows()) {
@@ -138,10 +148,6 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_AArch64_AAPCS_SwiftError_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::SwiftTail)
return CSR_AArch64_AAPCS_SwiftTail_SaveList;
- if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
- return CSR_AArch64_RT_MostRegs_SaveList;
- if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll)
- return CSR_AArch64_RT_AllRegs_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::Win64)
// This is for OSes other than Windows; Windows is a separate case further
// above.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5b5565a..197aae6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3007,9 +3007,9 @@ AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
llvm_unreachable("Unsupported register kind");
}
-bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
- ArrayRef<const Value *> Args,
- Type *SrcOverrideTy) const {
+bool AArch64TTIImpl::isSingleExtWideningInstruction(
+ unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
+ Type *SrcOverrideTy) const {
// A helper that returns a vector type from the given type. The number of
// elements in type Ty determines the vector width.
auto toVectorTy = [&](Type *ArgTy) {
@@ -3027,48 +3027,29 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
(DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
return false;
- // Determine if the operation has a widening variant. We consider both the
- // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
- // instructions.
- //
- // TODO: Add additional widening operations (e.g., shl, etc.) once we
- // verify that their extending operands are eliminated during code
- // generation.
Type *SrcTy = SrcOverrideTy;
switch (Opcode) {
- case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
- case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
+ case Instruction::Add: // UADDW(2), SADDW(2).
+ case Instruction::Sub: { // USUBW(2), SSUBW(2).
// The second operand needs to be an extend
if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
if (!SrcTy)
SrcTy =
toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
- } else
+ break;
+ }
+
+ if (Opcode == Instruction::Sub)
return false;
- break;
- case Instruction::Mul: { // SMULL(2), UMULL(2)
- // Both operands need to be extends of the same type.
- if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
- (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
+
+ // UADDW(2), SADDW(2) can be commuted.
+ if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
if (!SrcTy)
SrcTy =
toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
- } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
- // If one of the operands is a Zext and the other has enough zero bits to
- // be treated as unsigned, we can still general a umull, meaning the zext
- // is free.
- KnownBits Known =
- computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
- if (Args[0]->getType()->getScalarSizeInBits() -
- Known.Zero.countLeadingOnes() >
- DstTy->getScalarSizeInBits() / 2)
- return false;
- if (!SrcTy)
- SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
- DstTy->getScalarSizeInBits() / 2));
- } else
- return false;
- break;
+ break;
+ }
+ return false;
}
default:
return false;
@@ -3099,6 +3080,73 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
}
+Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
+ ArrayRef<const Value *> Args,
+ Type *SrcOverrideTy) const {
+ if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
+ Opcode != Instruction::Mul)
+ return nullptr;
+
+ // Exit early if DstTy is not a vector type whose elements are one of [i16,
+ // i32, i64]. SVE doesn't generally have the same set of instructions to
+ // perform an extend with the add/sub/mul. There are SMULLB style
+ // instructions, but they operate on top/bottom, requiring some sort of lane
+ // interleaving to be used with zext/sext.
+ unsigned DstEltSize = DstTy->getScalarSizeInBits();
+ if (!useNeonVector(DstTy) || Args.size() != 2 ||
+ (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
+ return nullptr;
+
+ auto getScalarSizeWithOverride = [&](const Value *V) {
+ if (SrcOverrideTy)
+ return SrcOverrideTy->getScalarSizeInBits();
+ return cast<Instruction>(V)
+ ->getOperand(0)
+ ->getType()
+ ->getScalarSizeInBits();
+ };
+
+ unsigned MaxEltSize = 0;
+ if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
+ (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
+ unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
+ unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
+ MaxEltSize = std::max(EltSize0, EltSize1);
+ } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
+ isa<SExtInst, ZExtInst>(Args[1])) {
+ unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
+ unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
+ // mul(sext, zext) will become smull(sext, zext) if the extends are large
+ // enough.
+ if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
+ return nullptr;
+ MaxEltSize = DstEltSize / 2;
+ } else if (Opcode == Instruction::Mul &&
+ (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
+ // If one of the operands is a Zext and the other has enough zero bits
+ // to be treated as unsigned, we can still generate a umull, meaning the
+ // zext is free.
+ KnownBits Known =
+ computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
+ if (Args[0]->getType()->getScalarSizeInBits() -
+ Known.Zero.countLeadingOnes() >
+ DstTy->getScalarSizeInBits() / 2)
+ return nullptr;
+
+ MaxEltSize =
+ getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
+ } else
+ return nullptr;
+
+ if (MaxEltSize * 2 > DstEltSize)
+ return nullptr;
+
+ Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
+ if (ExtTy->getPrimitiveSizeInBits() <= 64)
+ return nullptr;
+ return ExtTy;
+}
+
// s/urhadd instructions implement the following pattern, making the
// extends free:
// %x = add ((zext i8 -> i16), 1)
@@ -3159,7 +3207,24 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
if (I && I->hasOneUser()) {
auto *SingleUser = cast<Instruction>(*I->user_begin());
SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
- if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
+ if (Type *ExtTy = isBinExtWideningInstruction(
+ SingleUser->getOpcode(), Dst, Operands,
+ Src != I->getOperand(0)->getType() ? Src : nullptr)) {
+ // The cost from Src->Src*2 needs to be added if required; the cost from
+ // Src*2->ExtTy is free.
+ if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
+ Type *DoubleSrcTy =
+ Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
+ return getCastInstrCost(Opcode, DoubleSrcTy, Src,
+ TTI::CastContextHint::None, CostKind);
+ }
+
+ return 0;
+ }
+
+ if (isSingleExtWideningInstruction(
+ SingleUser->getOpcode(), Dst, Operands,
+ Src != I->getOperand(0)->getType() ? Src : nullptr)) {
// For adds only count the second operand as free if both operands are
// extends but not the same operation. (i.e both operands are not free in
// add(sext, zext)).
@@ -3168,8 +3233,11 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
(isa<CastInst>(SingleUser->getOperand(1)) &&
cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
return 0;
- } else // Others are free so long as isWideningInstruction returned true.
+ } else {
+ // Others are free so long as isSingleExtWideningInstruction
+ // returned true.
return 0;
+ }
}
// The cast will be free for the s/urhadd instructions
@@ -4148,6 +4216,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
}))
return *PromotedCost;
+ // If the operation is a widening instruction (smull or umull) and both
+ // operands are extends the cost can be cheaper by considering that the
+ // operation will operate on the narrowest type size possible (double the
+ // largest input size) and a further extend.
+ if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
+ if (ExtTy != Ty)
+ return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
+ getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
+ TTI::CastContextHint::None, CostKind);
+ return LT.first;
+ }
+
switch (ISD) {
default:
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
@@ -4381,10 +4461,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// - two 2-cost i64 inserts, and
// - two 1-cost muls.
// So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
- // LT.first = 2 the cost is 28. If both operands are extensions it will not
- // need to scalarize so the cost can be cheaper (smull or umull).
- // so the cost can be cheaper (smull or umull).
- if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
+ // LT.first = 2 the cost is 28.
+ if (LT.second != MVT::v2i64)
return LT.first;
return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
(getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
@@ -6129,7 +6207,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
}
static bool containsDecreasingPointers(Loop *TheLoop,
- PredicatedScalarEvolution *PSE) {
+ PredicatedScalarEvolution *PSE,
+ const DominatorTree &DT) {
const auto &Strides = DenseMap<Value *, const SCEV *>();
for (BasicBlock *BB : TheLoop->blocks()) {
// Scan the instructions in the block and look for addresses that are
@@ -6138,8 +6217,8 @@ static bool containsDecreasingPointers(Loop *TheLoop,
if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
Value *Ptr = getLoadStorePointerOperand(&I);
Type *AccessTy = getLoadStoreType(&I);
- if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
- /*ShouldCheckWrap=*/false)
+ if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
+ /*Assume=*/true, /*ShouldCheckWrap=*/false)
.value_or(0) < 0)
return true;
}
@@ -6184,7 +6263,8 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
// negative strides. This will require extra work to reverse the loop
// predicate, which may be expensive.
if (containsDecreasingPointers(TFI->LVL->getLoop(),
- TFI->LVL->getPredicatedScalarEvolution()))
+ TFI->LVL->getPredicatedScalarEvolution(),
+ *TFI->LVL->getDominatorTree()))
Required |= TailFoldingOpts::Reverse;
if (Required == TailFoldingOpts::Disabled)
Required |= TailFoldingOpts::Simple;
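
The widening-cost logic above hinges on a simple range fact: if both multiply operands fit in half the destination element width, the full product fits in the destination, so the feeding extends are effectively free (umull/smull). A scalar sketch of the unsigned case:

#include <cassert>
#include <cstdint>

// A 32-bit multiply of two values that each fit in 16 bits never overflows,
// so it can be done as one widening operation with free extends.
uint32_t wideningMul(uint16_t A, uint16_t B) {
  return static_cast<uint32_t>(A) * static_cast<uint32_t>(B);
}

int main() {
  assert(wideningMul(0xFFFF, 0xFFFF) == 0xFFFE0001u); // largest possible product
}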
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index b39546a..e62fdb6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -59,9 +59,17 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
VECTOR_LDST_FOUR_ELEMENTS
};
- bool isWideningInstruction(Type *DstTy, unsigned Opcode,
- ArrayRef<const Value *> Args,
- Type *SrcOverrideTy = nullptr) const;
+ /// Given an add/sub/mul operation, detect a widening addl/subl/mull pattern
+ /// where both operands can be treated like extends. Returns the minimal type
+ /// needed to compute the operation.
+ Type *isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
+ ArrayRef<const Value *> Args,
+ Type *SrcOverrideTy = nullptr) const;
+ /// Given an add/sub operation with a single extend operand, detect a
+ /// widening addw/subw pattern.
+ bool isSingleExtWideningInstruction(unsigned Opcode, Type *DstTy,
+ ArrayRef<const Value *> Args,
+ Type *SrcOverrideTy = nullptr) const;
// A helper function called by 'getVectorInstrCost'.
//
@@ -304,7 +312,7 @@ public:
}
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const {
- if (!ST->hasSVE())
+ if (!ST->isSVEorStreamingSVEAvailable())
return false;
// For fixed vectors, avoid scalarization if using SVE for them.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 5f93847..038ad77 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1809,6 +1809,9 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return LowerBinOp(TargetOpcode::G_FMAXNUM);
case Intrinsic::aarch64_neon_fminnm:
return LowerBinOp(TargetOpcode::G_FMINNUM);
+ case Intrinsic::aarch64_neon_pmull:
+ case Intrinsic::aarch64_neon_pmull64:
+ return LowerBinOp(AArch64::G_PMULL);
case Intrinsic::aarch64_neon_smull:
return LowerBinOp(AArch64::G_SMULL);
case Intrinsic::aarch64_neon_umull:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 6d2d705..6b920f0 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -560,6 +560,7 @@ bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI,
case TargetOpcode::G_FCMP:
case TargetOpcode::G_LROUND:
case TargetOpcode::G_LLROUND:
+ case AArch64::G_PMULL:
return true;
case TargetOpcode::G_INTRINSIC:
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 8669978..56ab040 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -38,9 +38,10 @@ enum ImplicitArgumentPositions {
#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
enum ImplicitArgumentMask {
- NOT_IMPLICIT_INPUT = 0,
+ UNKNOWN_INTRINSIC = 0,
#include "AMDGPUAttributes.def"
- ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
+ ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1,
+ NOT_IMPLICIT_INPUT
};
#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
@@ -115,7 +116,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
return QUEUE_PTR;
default:
- return NOT_IMPLICIT_INPUT;
+ return UNKNOWN_INTRINSIC;
}
}
@@ -534,6 +535,21 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
ImplicitArgumentMask AttrMask =
intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
HasApertureRegs, SupportsGetDoorbellID, COV);
+
+ if (AttrMask == UNKNOWN_INTRINSIC) {
+ // Assume not-nocallback intrinsics may invoke a function which accesses
+ // implicit arguments.
+ //
+ // FIXME: This isn't really the correct check. We want to ensure it
+ // isn't calling any function that may use implicit arguments regardless
+ // of whether it's internal to the module or not.
+ //
+ // TODO: Ignoring callsite attributes.
+ if (!Callee->hasFnAttribute(Attribute::NoCallback))
+ return indicatePessimisticFixpoint();
+ continue;
+ }
+
if (AttrMask != NOT_IMPLICIT_INPUT) {
if ((IsNonEntryFunc || !NonKernelOnly))
removeAssumedBits(AttrMask);
@@ -1357,7 +1373,10 @@ struct AAAMDGPUMinAGPRAlloc
default:
// Some intrinsics may use AGPRs, but if we have a choice, we are not
// required to use AGPRs.
- return true;
+
+ // Assume !nocallback intrinsics may call a function which requires
+ // AGPRs.
+ return CB.hasFnAttr(Attribute::NoCallback);
}
// TODO: Handle callsite attributes
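
The attributor change separates "intrinsic we do not recognize" from "intrinsic we recognize that needs no implicit argument", so only the former forces a pessimistic fixpoint. A small sketch of that three-way classification (the IDs and names are made up):

#include <optional>

enum class ImplicitArg { None, QueuePtr };

// Distinguish "known intrinsic, needs nothing" from "unknown intrinsic";
// only the latter has to assume the worst.
std::optional<ImplicitArg> classifyIntrinsic(unsigned ID) {
  switch (ID) {
  case 1: return ImplicitArg::QueuePtr; // hypothetical known intrinsic
  case 2: return ImplicitArg::None;     // known, needs no implicit input
  default: return std::nullopt;         // unknown: pessimistic result
  }
}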
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 0c97741..15ed60b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -116,8 +116,14 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
if (!DstRC || DstRC != SrcRC)
return false;
- return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
- RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
+ if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
+ !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
+ return false;
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
+ MI.getOperand(0).setIsEarlyClobber(true);
+ }
+ return true;
}
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
@@ -602,6 +608,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
I.setDesc(TII.get(Opc));
I.addOperand(*MF, MachineOperand::CreateImm(0));
I.addImplicitDefUseOperands(*MF);
+ I.getOperand(0).setIsEarlyClobber(true);
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
@@ -3787,6 +3794,10 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
MI.removeOperand(1); // Intrinsic ID
MI.addOperand(VDst_In); // Readd VDst_In to the end
MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
+ MI.getOperand(0).setIsEarlyClobber(true);
+ }
return true;
}
@@ -6753,7 +6764,7 @@ bool AMDGPUInstructionSelector::selectSGetBarrierState(
MachineInstr &I, Intrinsic::ID IntrID) const {
MachineBasicBlock *MBB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- MachineOperand BarOp = I.getOperand(2);
+ const MachineOperand &BarOp = I.getOperand(2);
std::optional<int64_t> BarValImm =
getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
@@ -6806,8 +6817,8 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit(
MachineInstr &I, Intrinsic::ID IntrID) const {
MachineBasicBlock *MBB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- MachineOperand BarOp = I.getOperand(1);
- MachineOperand CntOp = I.getOperand(2);
+ const MachineOperand &BarOp = I.getOperand(1);
+ const MachineOperand &CntOp = I.getOperand(2);
// BarID = (BarOp >> 4) & 0x3F
Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 0a59132..fdff21b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1565,8 +1565,11 @@ void SplitPtrStructs::processConditionals() {
} else if (isa<SelectInst>(I)) {
if (MaybeRsrc) {
if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) {
- ConditionalTemps.push_back(RsrcInst);
- RsrcInst->replaceAllUsesWith(*MaybeRsrc);
+ // Guard against conditionals that were already folded away.
+ if (RsrcInst != *MaybeRsrc) {
+ ConditionalTemps.push_back(RsrcInst);
+ RsrcInst->replaceAllUsesWith(*MaybeRsrc);
+ }
}
for (Value *V : Seen)
FoundRsrcs[V] = *MaybeRsrc;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 52cc4ca..1a14629 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -435,7 +435,7 @@ void GCNHazardRecognizer::RecedeCycle() {
// Helper Functions
//===----------------------------------------------------------------------===//
-using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
+enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound };
using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
index 959ce69..1682abb 100644
--- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -43,7 +43,7 @@ public:
bool run(MachineFunction &MF);
private:
- using NSA_Status = enum {
+ enum NSA_Status {
NOT_NSA, // Not an NSA instruction
FIXED, // NSA which we cannot modify
NON_CONTIGUOUS, // NSA with non-sequential address which we can try
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4deb2a9..62172a0 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -136,7 +136,7 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
continue;
if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
- MachineOperand DefSrcMO = Def.getOperand(1);
+ const MachineOperand &DefSrcMO = Def.getOperand(1);
// Immediates are not an issue and can be propagated in
// postrapseudos pass. Only handle cases where defining
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 2aa54c9..31eca04 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1129,12 +1129,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) {
// Add an extra level of chain to isolate this vector
SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
- // TODO: can the chain be replaced without creating a new store?
- SDValue NewStore = DAG.getTruncStore(
- NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT,
- StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(),
- StoreNode->getAAInfo());
- StoreNode = cast<StoreSDNode>(NewStore);
+ SmallVector<SDValue, 4> NewOps(StoreNode->ops());
+ NewOps[0] = NewChain;
+ StoreNode = cast<StoreSDNode>(DAG.UpdateNodeOperands(StoreNode, NewOps));
}
return scalarizeVectorStore(StoreNode, DAG);
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 6616b30..84984a0 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1129,40 +1129,11 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
return false;
- MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
return true;
}
- // TODO: Verify the following code handles subregisters correctly.
- // TODO: Handle extract of global reference
- if (UseOp.getSubReg())
- return false;
-
- if (!OpToFold.isReg())
- return false;
-
- Register UseReg = OpToFold.getReg();
- if (!UseReg.isVirtual())
- return false;
-
- // Maybe it is just a COPY of an immediate itself.
-
- // FIXME: Remove this handling. There is already special case folding of
- // immediate into copy in foldOperand. This is looking for the def of the
- // value the folding started from in the first place.
- MachineInstr *Def = MRI->getVRegDef(UseReg);
- if (Def && TII->isFoldableCopy(*Def)) {
- MachineOperand &DefOp = Def->getOperand(1);
- if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
- FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC,
- OpToFold.DefSubReg);
- appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm);
- return true;
- }
- }
-
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 5c39f7a..aa5ea77 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -2170,7 +2170,9 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
return MFI.getStackSize() != 0;
}
- return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
+ return (frameTriviallyRequiresSP(MFI) &&
+ !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) ||
+ MFI.isFrameAddressTaken() ||
MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
MF) ||
mayReserveScratchForCWSR(MF) ||
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 45f5919..9460145 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7945,7 +7945,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
legalizeOperands(*NewInstr, MDT);
int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
- MachineOperand SCCOp = Inst.getOperand(SCCIdx);
+ const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
Inst.eraseFromParent();
return;
@@ -7985,7 +7985,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
legalizeOperandsVALUt16(*NewInstr, MRI);
legalizeOperands(*NewInstr, MDT);
int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
- MachineOperand SCCOp = Inst.getOperand(SCCIdx);
+ const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
Inst.eraseFromParent();
return;
@@ -8183,7 +8183,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
AMDGPU::OpName::src0_modifiers) >= 0)
NewInstr.addImm(0);
if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
- MachineOperand Src = Inst.getOperand(1);
+ const MachineOperand &Src = Inst.getOperand(1);
NewInstr->addOperand(Src);
}
@@ -9199,7 +9199,7 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
+void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
MachineInstr &SCCDefInst,
SIInstrWorklist &Worklist,
Register NewCond) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index dc23a21..0643b53 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -172,7 +172,7 @@ private:
void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI,
SIInstrWorklist &Worklist) const;
- void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
+ void addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
MachineInstr &SCCDefInst,
SIInstrWorklist &Worklist,
Register NewCond = Register()) const;
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 7431e11..8785968 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -296,7 +296,7 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
E = MI.getIterator();
I != E; ++I) {
- if (I->isBundle())
+ if (I->isBundle() || I->isDebugInstr())
continue;
switch (I->getOpcode()) {
case AMDGPU::S_SET_GPR_IDX_MODE:
@@ -640,7 +640,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
}
void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
- MachineOperand DstOp = I.getOperand(0);
+ const MachineOperand &DstOp = I.getOperand(0);
uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 4ae2c1e..31d8bce4 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1707,7 +1707,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse
defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2");
defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
- let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+ let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in {
let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in
def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo {
let PseudoInstr = Instr#PseudoInstrSuffix;
@@ -1734,7 +1734,7 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P
let mayRaiseFPException = 0;
let ReadsModeReg = 0;
let AsmMatchConverter = "cvtSWMMAC";
-
+ let isConvergent = 1;
let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef";
}
}
@@ -1906,8 +1906,10 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16
defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">;
} // End is_wmma_xdl = 1.
-defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>;
-defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>;
+let isConvergent = 1 in {
+ defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>;
+ defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>;
+}
} // End SubtargetPredicate = isGFX125xOnly
} // End WaveSizePredicate = isWave32
diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp
index 14e1160..88d3b6f 100644
--- a/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -86,7 +86,7 @@ namespace {
// All possible address modes, plus some.
class Address {
public:
- using BaseKind = enum { RegBase, FrameIndexBase };
+ enum BaseKind { RegBase, FrameIndexBase };
private:
BaseKind Kind = RegBase;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 6b06534..92fae71 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1312,8 +1312,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
}
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
- setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
// FP-ARMv8 implements a lot of rounding-like FP operations.
if (Subtarget->hasFPARMv8Base()) {
@@ -9855,76 +9855,6 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
}
-SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
- // For iOS, we want to call an alternative entry point: __sincos_stret,
- // return values are passed via sret.
- SDLoc dl(Op);
- SDValue Arg = Op.getOperand(0);
- EVT ArgVT = Arg.getValueType();
- RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT);
- RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC);
- if (SincosStret == RTLIB::Unsupported)
- return SDValue();
-
- assert(Subtarget->isTargetDarwin());
-
- Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
- auto PtrVT = getPointerTy(DAG.getDataLayout());
-
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
-
- // Pair of floats / doubles used to pass the result.
- Type *RetTy = StructType::get(ArgTy, ArgTy);
- auto &DL = DAG.getDataLayout();
-
- ArgListTy Args;
- bool ShouldUseSRet = getTM().isAPCS_ABI();
- SDValue SRet;
- if (ShouldUseSRet) {
- // Create stack object for sret.
- const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
- const Align StackAlign = DL.getPrefTypeAlign(RetTy);
- int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
- SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL));
-
- ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext()));
- Entry.IsSExt = false;
- Entry.IsZExt = false;
- Entry.IsSRet = true;
- Args.push_back(Entry);
- RetTy = Type::getVoidTy(*DAG.getContext());
- }
-
- Args.emplace_back(Arg, ArgTy);
-
- StringRef LibcallName = getLibcallImplName(SincosStret);
- CallingConv::ID CC = getLibcallImplCallingConv(SincosStret);
- SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL));
-
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl)
- .setChain(DAG.getEntryNode())
- .setCallee(CC, RetTy, Callee, std::move(Args))
- .setDiscardResult(ShouldUseSRet);
- std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
-
- if (!ShouldUseSRet)
- return CallResult.first;
-
- SDValue LoadSin =
- DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
-
- // Address of cos field.
- SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
- DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
- SDValue LoadCos =
- DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
-
- SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
- return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
- LoadSin.getValue(0), LoadCos.getValue(0));
-}
-
SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
bool Signed,
SDValue &Chain) const {
@@ -10726,8 +10656,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VECREDUCE_SMAX:
return LowerVecReduceMinMax(Op, DAG, Subtarget);
case ISD::ATOMIC_LOAD:
- case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
- case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
+ case ISD::ATOMIC_STORE:
+ return LowerAtomicLoadStore(Op, DAG);
case ISD::SDIVREM:
case ISD::UDIVREM: return LowerDivRem(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index bf3438b..bc2fec3 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -901,7 +901,6 @@ class VectorType;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const;
void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 9b250e6..24f58a6 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -2448,7 +2448,8 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
//
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
const DataLayout &DL,
- const LoopAccessInfo *LAI) {
+ const LoopAccessInfo *LAI,
+ const DominatorTree &DT) {
LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
// If there are live-out values, it is probably a reduction. We can predicate
@@ -2498,7 +2499,8 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
Value *Ptr = getLoadStorePointerOperand(&I);
Type *AccessTy = getLoadStoreType(&I);
- int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
+ int64_t NextStride =
+ getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0);
if (NextStride == 1) {
// TODO: for now only allow consecutive strides of 1. We could support
// other strides as long as it is uniform, but let's keep it simple
@@ -2585,7 +2587,8 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
return false;
}
- return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI());
+ return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(),
+ *LVL->getDominatorTree());
}
TailFoldingStyle
diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
index b2a8204..abe081c 100644
--- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
+++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -176,10 +176,6 @@ void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) {
if (const GlobalValue *GV = Op.getGlobal())
if (GV->getName() == BPF_TRAP)
SawTrapCall = true;
- } else if (Op.isSymbol()) {
- if (const MCSymbol *Sym = Op.getMCSymbol())
- if (Sym->getName() == BPF_TRAP)
- SawTrapCall = true;
}
}
}
@@ -219,6 +215,10 @@ void BPFAsmPrinter::emitJumpTableInfo() {
const TargetLoweringObjectFile &TLOF = getObjFileLowering();
const Function &F = MF->getFunction();
+
+ MCSection *Sec = OutStreamer->getCurrentSectionOnly();
+ MCSymbol *SecStart = Sec->getBeginSymbol();
+
MCSection *JTS = TLOF.getSectionForJumpTable(F, TM);
assert(MJTI->getEntryKind() == MachineJumpTableInfo::EK_BlockAddress);
unsigned EntrySize = MJTI->getEntrySize(getDataLayout());
@@ -231,8 +231,10 @@ void BPFAsmPrinter::emitJumpTableInfo() {
MCSymbol *JTStart = getJTPublicSymbol(JTI);
OutStreamer->emitLabel(JTStart);
for (const MachineBasicBlock *MBB : JTBBs) {
- const MCExpr *LHS = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
- OutStreamer->emitValue(LHS, EntrySize);
+ const MCExpr *Diff = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(MBB->getSymbol(), OutContext),
+ MCSymbolRefExpr::create(SecStart, OutContext), OutContext);
+ OutStreamer->emitValue(Diff, EntrySize);
}
const MCExpr *JTSize =
MCConstantExpr::create(JTBBs.size() * EntrySize, OutContext);
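
With the change above, each jump-table entry is emitted as a label difference rather than an absolute symbol, so the table stays valid wherever the section is loaded. A loader-side sketch, assuming entries are offsets from the emitting section's start address (names are illustrative):

#include <cstdint>

// Recover an absolute branch target from a section-relative table entry.
uint64_t jumpTarget(uint64_t SectionLoadAddr, const int64_t *Table,
                    unsigned Index) {
  return SectionLoadAddr + static_cast<uint64_t>(Table[Index]);
}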
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index eb4c884..677203d 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -285,6 +285,13 @@ void DXContainerGlobals::addPipelineStateValidationInfo(
PSV.BaseData.NumThreadsX = MMI.EntryPropertyVec[0].NumThreadsX;
PSV.BaseData.NumThreadsY = MMI.EntryPropertyVec[0].NumThreadsY;
PSV.BaseData.NumThreadsZ = MMI.EntryPropertyVec[0].NumThreadsZ;
+ if (MMI.EntryPropertyVec[0].WaveSizeMin) {
+ PSV.BaseData.MinimumWaveLaneCount = MMI.EntryPropertyVec[0].WaveSizeMin;
+ PSV.BaseData.MaximumWaveLaneCount =
+ MMI.EntryPropertyVec[0].WaveSizeMax
+ ? MMI.EntryPropertyVec[0].WaveSizeMax
+ : MMI.EntryPropertyVec[0].WaveSizeMin;
+ }
break;
default:
break;
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 7ae500a..67437f6 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -1079,6 +1079,15 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> {
let attributes = [Attributes<DXIL1_0, []>];
}
+def LegacyF16ToF32 : DXILOp<131, legacyF16ToF32> {
+ let Doc = "returns the float16 stored in the low-half of the uint converted "
+ "to a float";
+ let intrinsics = [IntrinSelect<int_dx_legacyf16tof32>];
+ let arguments = [Int32Ty];
+ let result = FloatTy;
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+}
+
def WaveAllBitCount : DXILOp<135, waveAllOp> {
let Doc = "returns the count of bits set to 1 across the wave";
let intrinsics = [IntrinSelect<int_dx_wave_active_countbits>];
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
index cf8b833..e1a472f 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -82,6 +82,7 @@ enum class EntryPropsTag {
ASStateTag,
WaveSize,
EntryRootSig,
+ WaveRange = 23,
};
} // namespace
@@ -177,14 +178,15 @@ getTagValueAsMetadata(EntryPropsTag Tag, uint64_t Value, LLVMContext &Ctx) {
case EntryPropsTag::ASStateTag:
case EntryPropsTag::WaveSize:
case EntryPropsTag::EntryRootSig:
+ case EntryPropsTag::WaveRange:
llvm_unreachable("NYI: Unhandled entry property tag");
}
return MDVals;
}
-static MDTuple *
-getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,
- const Triple::EnvironmentType ShaderProfile) {
+static MDTuple *getEntryPropAsMetadata(Module &M, const EntryProperties &EP,
+ uint64_t EntryShaderFlags,
+ const ModuleMetadataInfo &MMDI) {
SmallVector<Metadata *> MDVals;
LLVMContext &Ctx = EP.Entry->getContext();
if (EntryShaderFlags != 0)
@@ -195,12 +197,13 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,
// FIXME: support more props.
// See https://github.com/llvm/llvm-project/issues/57948.
// Add shader kind for lib entries.
- if (ShaderProfile == Triple::EnvironmentType::Library &&
+ if (MMDI.ShaderProfile == Triple::EnvironmentType::Library &&
EP.ShaderStage != Triple::EnvironmentType::Library)
MDVals.append(getTagValueAsMetadata(EntryPropsTag::ShaderKind,
getShaderStage(EP.ShaderStage), Ctx));
if (EP.ShaderStage == Triple::EnvironmentType::Compute) {
+ // Handle mandatory "hlsl.numthreads"
MDVals.emplace_back(ConstantAsMetadata::get(ConstantInt::get(
Type::getInt32Ty(Ctx), static_cast<int>(EntryPropsTag::NumThreads))));
Metadata *NumThreadVals[] = {ConstantAsMetadata::get(ConstantInt::get(
@@ -210,8 +213,48 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,
ConstantAsMetadata::get(ConstantInt::get(
Type::getInt32Ty(Ctx), EP.NumThreadsZ))};
MDVals.emplace_back(MDNode::get(Ctx, NumThreadVals));
+
+    // Handle the optional "hlsl.wavesize" attribute. Its fields are only
+    // emitted when they are non-zero.
+ if (EP.WaveSizeMin != 0) {
+ bool IsWaveRange = VersionTuple(6, 8) <= MMDI.ShaderModelVersion;
+ bool IsWaveSize =
+ !IsWaveRange && VersionTuple(6, 6) <= MMDI.ShaderModelVersion;
+
+ if (!IsWaveRange && !IsWaveSize) {
+ reportError(M, "Shader model 6.6 or greater is required to specify "
+ "the \"hlsl.wavesize\" function attribute");
+ return nullptr;
+ }
+
+ // A range is being specified if EP.WaveSizeMax != 0
+ if (EP.WaveSizeMax && !IsWaveRange) {
+ reportError(
+ M, "Shader model 6.8 or greater is required to specify "
+ "wave size range values of the \"hlsl.wavesize\" function "
+ "attribute");
+ return nullptr;
+ }
+
+ EntryPropsTag Tag =
+ IsWaveSize ? EntryPropsTag::WaveSize : EntryPropsTag::WaveRange;
+ MDVals.emplace_back(ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt32Ty(Ctx), static_cast<int>(Tag))));
+
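+      // WaveSize (SM 6.6) carries a single value; WaveRange (SM 6.8) carries
+      // the minimum, maximum and preferred sizes.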
+ SmallVector<Metadata *> WaveSizeVals = {ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizeMin))};
+ if (IsWaveRange) {
+ WaveSizeVals.push_back(ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizeMax)));
+ WaveSizeVals.push_back(ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizePref)));
+ }
+
+ MDVals.emplace_back(MDNode::get(Ctx, WaveSizeVals));
+ }
}
}
+
if (MDVals.empty())
return nullptr;
return MDNode::get(Ctx, MDVals);
@@ -236,12 +279,11 @@ static MDTuple *constructEntryMetadata(const Function *EntryFn,
return MDNode::get(Ctx, MDVals);
}
-static MDTuple *emitEntryMD(const EntryProperties &EP, MDTuple *Signatures,
- MDNode *MDResources,
+static MDTuple *emitEntryMD(Module &M, const EntryProperties &EP,
+ MDTuple *Signatures, MDNode *MDResources,
const uint64_t EntryShaderFlags,
- const Triple::EnvironmentType ShaderProfile) {
- MDTuple *Properties =
- getEntryPropAsMetadata(EP, EntryShaderFlags, ShaderProfile);
+ const ModuleMetadataInfo &MMDI) {
+ MDTuple *Properties = getEntryPropAsMetadata(M, EP, EntryShaderFlags, MMDI);
return constructEntryMetadata(EP.Entry, Signatures, MDResources, Properties,
EP.Entry->getContext());
}
@@ -523,10 +565,8 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM,
Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) +
"'"));
}
-
- EntryFnMDNodes.emplace_back(emitEntryMD(EntryProp, Signatures, ResourceMD,
- EntryShaderFlags,
- MMDI.ShaderProfile));
+ EntryFnMDNodes.emplace_back(emitEntryMD(
+ M, EntryProp, Signatures, ResourceMD, EntryShaderFlags, MMDI));
}
NamedMDNode *EntryPointsNamedMD =
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
index 60dfd96..6cacbf6 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
@@ -29,11 +29,12 @@ bool DirectXTTIImpl::isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID,
int OpdIdx) const {
switch (ID) {
case Intrinsic::dx_asdouble:
- case Intrinsic::dx_isinf:
- case Intrinsic::dx_isnan:
case Intrinsic::dx_firstbitlow:
- case Intrinsic::dx_firstbituhigh:
case Intrinsic::dx_firstbitshigh:
+ case Intrinsic::dx_firstbituhigh:
+ case Intrinsic::dx_isinf:
+ case Intrinsic::dx_isnan:
+ case Intrinsic::dx_legacyf16tof32:
return OpdIdx == 0;
default:
return OpdIdx == -1;
@@ -50,6 +51,7 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable(
case Intrinsic::dx_frac:
case Intrinsic::dx_isinf:
case Intrinsic::dx_isnan:
+ case Intrinsic::dx_legacyf16tof32:
case Intrinsic::dx_rsqrt:
case Intrinsic::dx_saturate:
case Intrinsic::dx_splitdouble:
diff --git a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
index 479ac90..f29a739 100644
--- a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
@@ -104,13 +104,6 @@ const std::map<unsigned short, unsigned short> QFPInstMap{
{Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}};
} // namespace
-namespace llvm {
-
-FunctionPass *createHexagonQFPOptimizer();
-void initializeHexagonQFPOptimizerPass(PassRegistry &);
-
-} // namespace llvm
-
namespace {
struct HexagonQFPOptimizer : public MachineFunctionPass {
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index fe700e1..cf4ffc82 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -6630,6 +6630,11 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(LoongArchISD::VANY_NONZERO, DL, N->getValueType(0),
N->getOperand(1));
break;
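+  // Lower the LASX 128-bit concatenation intrinsics directly to
+  // CONCAT_VECTORS.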
+ case Intrinsic::loongarch_lasx_concat_128_s:
+ case Intrinsic::loongarch_lasx_concat_128_d:
+ case Intrinsic::loongarch_lasx_concat_128:
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
}
return SDValue();
}
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index c89212d..90a4723 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -756,6 +756,155 @@ LoongArchInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
return ArrayRef(TargetFlags);
}
+bool LoongArchInstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
+ Register Reg,
+ const MachineInstr &AddrI,
+ ExtAddrMode &AM) const {
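+  // Classify the offset field of MemI: ImmNShiftM means the combined offset
+  // must be a signed N-bit immediate scaled by 2^M.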
+ enum MemIOffsetType {
+ Imm14Shift2,
+ Imm12,
+ Imm11Shift1,
+ Imm10Shift2,
+ Imm9Shift3,
+ Imm8,
+ Imm8Shift1,
+ Imm8Shift2,
+ Imm8Shift3
+ };
+
+ MemIOffsetType OT;
+ switch (MemI.getOpcode()) {
+ default:
+ return false;
+ case LoongArch::LDPTR_W:
+ case LoongArch::LDPTR_D:
+ case LoongArch::STPTR_W:
+ case LoongArch::STPTR_D:
+ OT = Imm14Shift2;
+ break;
+ case LoongArch::LD_B:
+ case LoongArch::LD_H:
+ case LoongArch::LD_W:
+ case LoongArch::LD_D:
+ case LoongArch::LD_BU:
+ case LoongArch::LD_HU:
+ case LoongArch::LD_WU:
+ case LoongArch::ST_B:
+ case LoongArch::ST_H:
+ case LoongArch::ST_W:
+ case LoongArch::ST_D:
+ case LoongArch::FLD_S:
+ case LoongArch::FLD_D:
+ case LoongArch::FST_S:
+ case LoongArch::FST_D:
+ case LoongArch::VLD:
+ case LoongArch::VST:
+ case LoongArch::XVLD:
+ case LoongArch::XVST:
+ case LoongArch::VLDREPL_B:
+ case LoongArch::XVLDREPL_B:
+ OT = Imm12;
+ break;
+ case LoongArch::VLDREPL_H:
+ case LoongArch::XVLDREPL_H:
+ OT = Imm11Shift1;
+ break;
+ case LoongArch::VLDREPL_W:
+ case LoongArch::XVLDREPL_W:
+ OT = Imm10Shift2;
+ break;
+ case LoongArch::VLDREPL_D:
+ case LoongArch::XVLDREPL_D:
+ OT = Imm9Shift3;
+ break;
+ case LoongArch::VSTELM_B:
+ case LoongArch::XVSTELM_B:
+ OT = Imm8;
+ break;
+ case LoongArch::VSTELM_H:
+ case LoongArch::XVSTELM_H:
+ OT = Imm8Shift1;
+ break;
+ case LoongArch::VSTELM_W:
+ case LoongArch::XVSTELM_W:
+ OT = Imm8Shift2;
+ break;
+ case LoongArch::VSTELM_D:
+ case LoongArch::XVSTELM_D:
+ OT = Imm8Shift3;
+ break;
+ }
+
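+  // Do not fold if Reg is also the data operand of the memory instruction.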
+ if (MemI.getOperand(0).getReg() == Reg)
+ return false;
+
+ if ((AddrI.getOpcode() != LoongArch::ADDI_W &&
+ AddrI.getOpcode() != LoongArch::ADDI_D) ||
+ !AddrI.getOperand(1).isReg() || !AddrI.getOperand(2).isImm())
+ return false;
+
+ int64_t OldOffset = MemI.getOperand(2).getImm();
+ int64_t Disp = AddrI.getOperand(2).getImm();
+ int64_t NewOffset = OldOffset + Disp;
+ if (!STI.is64Bit())
+ NewOffset = SignExtend64<32>(NewOffset);
+
+ if (!(OT == Imm14Shift2 && isShiftedInt<14, 2>(NewOffset) && STI.hasUAL()) &&
+ !(OT == Imm12 && isInt<12>(NewOffset)) &&
+ !(OT == Imm11Shift1 && isShiftedInt<11, 1>(NewOffset)) &&
+ !(OT == Imm10Shift2 && isShiftedInt<10, 2>(NewOffset)) &&
+ !(OT == Imm9Shift3 && isShiftedInt<9, 3>(NewOffset)) &&
+ !(OT == Imm8 && isInt<8>(NewOffset)) &&
+ !(OT == Imm8Shift1 && isShiftedInt<8, 1>(NewOffset)) &&
+ !(OT == Imm8Shift2 && isShiftedInt<8, 2>(NewOffset)) &&
+ !(OT == Imm8Shift3 && isShiftedInt<8, 3>(NewOffset)))
+ return false;
+
+ AM.BaseReg = AddrI.getOperand(1).getReg();
+ AM.ScaledReg = 0;
+ AM.Scale = 0;
+ AM.Displacement = NewOffset;
+ AM.Form = ExtAddrMode::Formula::Basic;
+ return true;
+}
+
+MachineInstr *
+LoongArchInstrInfo::emitLdStWithAddr(MachineInstr &MemI,
+ const ExtAddrMode &AM) const {
+ const DebugLoc &DL = MemI.getDebugLoc();
+ MachineBasicBlock &MBB = *MemI.getParent();
+
+ assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
+ "Addressing mode not supported for folding");
+
+ unsigned MemIOp = MemI.getOpcode();
+ switch (MemIOp) {
+ default:
+ return BuildMI(MBB, MemI, DL, get(MemIOp))
+ .addReg(MemI.getOperand(0).getReg(),
+ MemI.mayLoad() ? RegState::Define : 0)
+ .addReg(AM.BaseReg)
+ .addImm(AM.Displacement)
+ .setMemRefs(MemI.memoperands())
+ .setMIFlags(MemI.getFlags());
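+  // VSTELM/XVSTELM also carry a lane-index immediate (operand 3) that must be
+  // preserved.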
+ case LoongArch::VSTELM_B:
+ case LoongArch::VSTELM_H:
+ case LoongArch::VSTELM_W:
+ case LoongArch::VSTELM_D:
+ case LoongArch::XVSTELM_B:
+ case LoongArch::XVSTELM_H:
+ case LoongArch::XVSTELM_W:
+ case LoongArch::XVSTELM_D:
+ return BuildMI(MBB, MemI, DL, get(MemIOp))
+ .addReg(MemI.getOperand(0).getReg(), 0)
+ .addReg(AM.BaseReg)
+ .addImm(AM.Displacement)
+ .addImm(MemI.getOperand(3).getImm())
+ .setMemRefs(MemI.memoperands())
+ .setMIFlags(MemI.getFlags());
+ }
+}
+
// Returns true if this is the sext.w pattern, addi.w rd, rs, 0.
bool LoongArch::isSEXT_W(const MachineInstr &MI) {
return MI.getOpcode() == LoongArch::ADDI_W && MI.getOperand(1).isReg() &&
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
index f25958a..f69a558 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
@@ -93,6 +93,12 @@ public:
ArrayRef<std::pair<unsigned, const char *>>
getSerializableBitmaskMachineOperandTargetFlags() const override;
+ bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg,
+ const MachineInstr &AddrI,
+ ExtAddrMode &AM) const override;
+ MachineInstr *emitLdStWithAddr(MachineInstr &MemI,
+ const ExtAddrMode &AM) const override;
+
protected:
const LoongArchSubtarget &STI;
};
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index b502b056..00d5287 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -2113,6 +2113,37 @@ defm : subvector_subreg_lowering<LSX128, v2f64, LASX256, v4f64, 2, sub_128>;
defm : subvector_subreg_lowering<LSX128, v8i16, LASX256, v16i16, 8, sub_128>;
defm : subvector_subreg_lowering<LSX128, v16i8, LASX256, v32i8, 16, sub_128>;
+// LASX and LSX conversion
+def : Pat<(int_loongarch_lasx_cast_128_s (v4f32 LSX128:$src)),
+ (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>;
+def : Pat<(int_loongarch_lasx_cast_128_d (v2f64 LSX128:$src)),
+ (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>;
+def : Pat<(int_loongarch_lasx_cast_128 (v2i64 LSX128:$src)),
+ (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>;
+def : Pat<(int_loongarch_lasx_extract_128_lo_s (v8f32 LASX256:$src)),
+ (EXTRACT_SUBREG LASX256:$src, sub_128)>;
+def : Pat<(int_loongarch_lasx_extract_128_lo_d (v4f64 LASX256:$src)),
+ (EXTRACT_SUBREG LASX256:$src, sub_128)>;
+def : Pat<(int_loongarch_lasx_extract_128_lo (v4i64 LASX256:$src)),
+ (EXTRACT_SUBREG LASX256:$src, sub_128)>;
+def : Pat<(int_loongarch_lasx_extract_128_hi_s (v8f32 LASX256:$src)),
+ (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>;
+def : Pat<(int_loongarch_lasx_extract_128_hi_d (v4f64 LASX256:$src)),
+ (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>;
+def : Pat<(int_loongarch_lasx_extract_128_hi (v4i64 LASX256:$src)),
+ (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>;
+def : Pat<(int_loongarch_lasx_insert_128_lo_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)),
+ (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>;
+def : Pat<(int_loongarch_lasx_insert_128_lo_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)),
+ (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>;
+def : Pat<(int_loongarch_lasx_insert_128_lo (v4i64 LASX256:$src), (v2i64 LSX128:$lo)),
+ (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>;
+def : Pat<(int_loongarch_lasx_insert_128_hi_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)),
+ (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>;
+def : Pat<(int_loongarch_lasx_insert_128_hi_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)),
+ (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>;
+def : Pat<(int_loongarch_lasx_insert_128_hi (v4i64 LASX256:$src), (v2i64 LSX128:$lo)),
+ (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>;
} // Predicates = [HasExtLASX]
/// Intrinsic pattern
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index 9de4c9d..92a9388 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -62,6 +62,11 @@ static cl::opt<bool>
cl::desc("Enable the merge base offset pass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool>
+ EnableSinkFold("loongarch-enable-sink-fold",
+ cl::desc("Enable sinking and folding of instruction copies"),
+ cl::init(true), cl::Hidden);
+
static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
return RM.value_or(Reloc::Static);
}
@@ -146,7 +151,9 @@ namespace {
class LoongArchPassConfig : public TargetPassConfig {
public:
LoongArchPassConfig(LoongArchTargetMachine &TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {
+ setEnableSinkAndFold(EnableSinkFold);
+ }
LoongArchTargetMachine &getLoongArchTargetMachine() const {
return getTM<LoongArchTargetMachine>();
diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp
index df0c8c1..06210b6 100644
--- a/llvm/lib/Target/Mips/MipsFastISel.cpp
+++ b/llvm/lib/Target/Mips/MipsFastISel.cpp
@@ -82,7 +82,7 @@ class MipsFastISel final : public FastISel {
// All possible address modes.
class Address {
public:
- using BaseKind = enum { RegBase, FrameIndexBase };
+ enum BaseKind { RegBase, FrameIndexBase };
private:
BaseKind Kind = RegBase;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index b260221..f0bdf47 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2267,7 +2267,7 @@ def : Pat<(f32 (fpround f64:$a)), (CVT_f32_f64 $a, CvtRN)>;
def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE)>;
// fpextend bf16 -> f32
-def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ, hasPTX<78>, hasSM<90>]>;
def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>;
// fpextend f16 -> f64
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index 0c2e44e..dfbbba0 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -420,6 +420,9 @@ let Predicates = [HasVSX, IsISAFuture] in {
: VXForm_VRTAB5<323, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB),
"vucmprlh $VRT, $VRA, $VRB", []>;
+ def XVRLW: XX3Form_XTAB6<60, 184, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvrlw $XT, $XA, $XB", []>;
+
// AES Acceleration Instructions
def XXAESENCP : XX3Form_XTABp5_M2<194, (outs vsrprc:$XTp),
(ins vsrprc:$XAp, vsrprc:$XBp, u2imm:$M),
diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td
index b38dd4a..fc3cde3 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td
@@ -202,7 +202,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
RegConstraint<"@earlyclobber $AT">;
def PM#NAME#WPP :
MMIRR_XX3Form_XY4P2_XAB6<
- opcode, !or(xo, 0x20), (outs acc:$AT),
+ opcode, !or(xo, 0x20), (outs wacc:$AT),
!con((ins wacc:$ATi),
!con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
!strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
@@ -765,7 +765,7 @@ let Predicates = [MMA, IsISAFuture] in {
def : Pat<(v512i1 (int_ppc_mma_xvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB)),
(XVF64GERWPN $ATi, $XA, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB)),
- (XVF64GERNP $ATi, $XA, RCCp.BToVSRC)>;
+ (XVF64GERWNP $ATi, $XA, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB)),
(XVF64GERWNN $ATi, $XA, RCCp.BToVSRC)>;
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index 0ff178e..e9088a4 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -58,6 +58,7 @@ add_llvm_target(RISCVCodeGen
RISCVMoveMerger.cpp
RISCVOptWInstrs.cpp
RISCVPostRAExpandPseudoInsts.cpp
+ RISCVPromoteConstant.cpp
RISCVPushPopOptimizer.cpp
RISCVRedundantCopyElimination.cpp
RISCVRegisterInfo.cpp
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index ae94101..51e8e85 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -20,6 +20,7 @@
namespace llvm {
class FunctionPass;
class InstructionSelector;
+class ModulePass;
class PassRegistry;
class RISCVRegisterBankInfo;
class RISCVSubtarget;
@@ -111,6 +112,9 @@ void initializeRISCVO0PreLegalizerCombinerPass(PassRegistry &);
FunctionPass *createRISCVPreLegalizerCombiner();
void initializeRISCVPreLegalizerCombinerPass(PassRegistry &);
+ModulePass *createRISCVPromoteConstantPass();
+void initializeRISCVPromoteConstantPass(PassRegistry &);
+
FunctionPass *createRISCVVLOptimizerPass();
void initializeRISCVVLOptimizerPass(PassRegistry &);
diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index 526675a..b0453fc 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -131,6 +131,7 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
case RISCV::PseudoCCMAXU:
case RISCV::PseudoCCMIN:
case RISCV::PseudoCCMINU:
+ case RISCV::PseudoCCMUL:
case RISCV::PseudoCCADDW:
case RISCV::PseudoCCSUBW:
case RISCV::PseudoCCSLL:
@@ -237,6 +238,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
case RISCV::PseudoCCMIN: NewOpc = RISCV::MIN; break;
case RISCV::PseudoCCMAXU: NewOpc = RISCV::MAXU; break;
case RISCV::PseudoCCMINU: NewOpc = RISCV::MINU; break;
+ case RISCV::PseudoCCMUL: NewOpc = RISCV::MUL; break;
case RISCV::PseudoCCADDI: NewOpc = RISCV::ADDI; break;
case RISCV::PseudoCCSLLI: NewOpc = RISCV::SLLI; break;
case RISCV::PseudoCCSRLI: NewOpc = RISCV::SRLI; break;
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index cfee6ab..5b72334 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1856,6 +1856,11 @@ def TuneShortForwardBranchIMinMax
"true", "Enable short forward branch optimization for min,max instructions in Zbb",
[TuneShortForwardBranchOpt]>;
+def TuneShortForwardBranchIMul
+ : SubtargetFeature<"short-forward-branch-i-mul", "HasShortForwardBranchIMul",
+ "true", "Enable short forward branch optimization for mul instruction",
+ [TuneShortForwardBranchOpt]>;
+
// Some subtargets require a S2V transfer buffer to move scalars into vectors.
// FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure.
def TuneNoSinkSplatOperands
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e0cf739..995ae75 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9186,7 +9186,7 @@ static SDValue lowerSelectToBinOp(SDNode *N, SelectionDAG &DAG,
unsigned ShAmount = Log2_64(TrueM1);
if (Subtarget.hasShlAdd(ShAmount))
return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, CondV,
- DAG.getConstant(ShAmount, DL, VT), CondV);
+ DAG.getTargetConstant(ShAmount, DL, VT), CondV);
}
}
// (select c, y, 0) -> -c & y
@@ -15463,7 +15463,7 @@ static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG,
SDValue NS = (C0 < C1) ? N0->getOperand(0) : N1->getOperand(0);
SDValue NL = (C0 > C1) ? N0->getOperand(0) : N1->getOperand(0);
SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, NL,
- DAG.getConstant(Diff, DL, VT), NS);
+ DAG.getTargetConstant(Diff, DL, VT), NS);
return DAG.getNode(ISD::SHL, DL, VT, SHADD, DAG.getConstant(Bits, DL, VT));
}
@@ -15501,7 +15501,7 @@ static SDValue combineShlAddIAddImpl(SDNode *N, SDValue AddI, SDValue Other,
int64_t AddConst = AddVal.getSExtValue();
SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, SHLVal->getOperand(0),
- DAG.getConstant(ShlConst, DL, VT), Other);
+ DAG.getTargetConstant(ShlConst, DL, VT), Other);
return DAG.getNode(ISD::ADD, DL, VT, SHADD,
DAG.getSignedConstant(AddConst, DL, VT));
}
@@ -16495,6 +16495,45 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(Op, DL, VT, Shift1, Shift2);
}
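+// Build (shXadd (shYadd X, X), Z), i.e. X * (2^ShY + 1) * 2^ShX + Z, where Z
+// is X when AddX is set and the inner shYadd otherwise.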
+static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX,
+ unsigned ShY, bool AddX) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue X = N->getOperand(0);
+ SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+ DAG.getTargetConstant(ShY, DL, VT), X);
+ return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
+ DAG.getTargetConstant(ShX, DL, VT), AddX ? X : Mul359);
+}
+
+static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG,
+ uint64_t MulAmt) {
+ // 3/5/9 * 3/5/9 -> (shXadd (shYadd X, X), (shYadd X, X))
+ switch (MulAmt) {
+ case 5 * 3:
+ return getShlAddShlAdd(N, DAG, 2, 1, /*AddX=*/false);
+ case 9 * 3:
+ return getShlAddShlAdd(N, DAG, 3, 1, /*AddX=*/false);
+ case 5 * 5:
+ return getShlAddShlAdd(N, DAG, 2, 2, /*AddX=*/false);
+ case 9 * 5:
+ return getShlAddShlAdd(N, DAG, 3, 2, /*AddX=*/false);
+ case 9 * 9:
+ return getShlAddShlAdd(N, DAG, 3, 3, /*AddX=*/false);
+ default:
+ break;
+ }
+
+ // 2/4/8 * 3/5/9 + 1 -> (shXadd (shYadd X, X), X)
+ int ShX;
+ if (int ShY = isShifted359(MulAmt - 1, ShX)) {
+ assert(ShX != 0 && "MulAmt=4,6,10 handled before");
+ if (ShX <= 3)
+ return getShlAddShlAdd(N, DAG, ShX, ShY, /*AddX=*/true);
+ }
+ return SDValue();
+}
+
// Try to expand a scalar multiply to a faster sequence.
static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -16524,18 +16563,17 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue()))
return SDValue();
- // WARNING: The code below is knowingly incorrect with regards to undef semantics.
- // We're adding additional uses of X here, and in principle, we should be freezing
- // X before doing so. However, adding freeze here causes real regressions, and no
- // other target properly freezes X in these cases either.
- SDValue X = N->getOperand(0);
-
+ // WARNING: The code below is knowingly incorrect with regards to undef
+ // semantics. We're adding additional uses of X here, and in principle, we
+ // should be freezing X before doing so. However, adding freeze here causes
+ // real regressions, and no other target properly freezes X in these cases
+ // either.
if (Subtarget.hasShlAdd(3)) {
+ SDValue X = N->getOperand(0);
int Shift;
if (int ShXAmount = isShifted359(MulAmt, Shift)) {
// 3/5/9 * 2^N -> shl (shXadd X, X), N
SDLoc DL(N);
- SDValue X = N->getOperand(0);
// Put the shift first if we can fold a zext into the shift forming
// a slli.uw.
if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) &&
@@ -16543,80 +16581,40 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
SDValue Shl =
DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT));
return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl,
- DAG.getConstant(ShXAmount, DL, VT), Shl);
+ DAG.getTargetConstant(ShXAmount, DL, VT), Shl);
}
// Otherwise, put the shl second so that it can fold with following
// instructions (e.g. sext or add).
SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(ShXAmount, DL, VT), X);
+ DAG.getTargetConstant(ShXAmount, DL, VT), X);
return DAG.getNode(ISD::SHL, DL, VT, Mul359,
DAG.getConstant(Shift, DL, VT));
}
- // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X)
- int ShX;
- int ShY;
- switch (MulAmt) {
- case 3 * 5:
- ShY = 1;
- ShX = 2;
- break;
- case 3 * 9:
- ShY = 1;
- ShX = 3;
- break;
- case 5 * 5:
- ShX = ShY = 2;
- break;
- case 5 * 9:
- ShY = 2;
- ShX = 3;
- break;
- case 9 * 9:
- ShX = ShY = 3;
- break;
- default:
- ShX = ShY = 0;
- break;
- }
- if (ShX) {
+ // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples
+ // of 25 which happen to be quite common.
+ // (2/4/8 * 3/5/9 + 1) * 2^N
+ Shift = llvm::countr_zero(MulAmt);
+ if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) {
+ if (Shift == 0)
+ return V;
SDLoc DL(N);
- SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(ShY, DL, VT), X);
- return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
- DAG.getConstant(ShX, DL, VT), Mul359);
+ return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT));
}
// If this is a power 2 + 2/4/8, we can use a shift followed by a single
// shXadd. First check if this a sum of two power of 2s because that's
// easy. Then count how many zeros are up to the first bit.
- if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
- unsigned ScaleShift = llvm::countr_zero(MulAmt);
- if (ScaleShift >= 1 && ScaleShift < 4) {
- unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
- SDLoc DL(N);
- SDValue Shift1 =
- DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
- return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(ScaleShift, DL, VT), Shift1);
- }
+ if (Shift >= 1 && Shift <= 3 && isPowerOf2_64(MulAmt & (MulAmt - 1))) {
+ unsigned ShiftAmt = llvm::countr_zero((MulAmt & (MulAmt - 1)));
+ SDLoc DL(N);
+ SDValue Shift1 =
+ DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
+ return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+ DAG.getTargetConstant(Shift, DL, VT), Shift1);
}
- // 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x)
- // This is the two instruction form, there are also three instruction
- // variants we could implement. e.g.
- // (2^(1,2,3) * 3,5,9 + 1) << C2
- // 2^(C1>3) * 3,5,9 +/- 1
- if (int ShXAmount = isShifted359(MulAmt - 1, Shift)) {
- assert(Shift != 0 && "MulAmt=4,6,10 handled before");
- if (Shift <= 3) {
- SDLoc DL(N);
- SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(ShXAmount, DL, VT), X);
- return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
- DAG.getConstant(Shift, DL, VT), X);
- }
- }
+ // TODO: 2^(C1>3) * 3,5,9 +/- 1
// 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X))
if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) {
@@ -16626,9 +16624,10 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
SDValue Shift1 =
DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
- return DAG.getNode(ISD::ADD, DL, VT, Shift1,
- DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(ScaleShift, DL, VT), X));
+ return DAG.getNode(
+ ISD::ADD, DL, VT, Shift1,
+ DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+ DAG.getTargetConstant(ScaleShift, DL, VT), X));
}
}
@@ -16643,29 +16642,10 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShAmt, DL, VT));
SDValue Mul359 =
DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(Log2_64(Offset - 1), DL, VT), X);
+ DAG.getTargetConstant(Log2_64(Offset - 1), DL, VT), X);
return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359);
}
}
-
- for (uint64_t Divisor : {3, 5, 9}) {
- if (MulAmt % Divisor != 0)
- continue;
- uint64_t MulAmt2 = MulAmt / Divisor;
- // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples
- // of 25 which happen to be quite common.
- if (int ShBAmount = isShifted359(MulAmt2, Shift)) {
- SDLoc DL(N);
- SDValue Mul359A =
- DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
- SDValue Mul359B =
- DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359A,
- DAG.getConstant(ShBAmount, DL, VT), Mul359A);
- return DAG.getNode(ISD::SHL, DL, VT, Mul359B,
- DAG.getConstant(Shift, DL, VT));
- }
- }
}
if (SDValue V = expandMulToAddOrSubOfShl(N, DAG, MulAmt))
@@ -25320,3 +25300,12 @@ ArrayRef<MCPhysReg> RISCVTargetLowering::getRoundingControlRegisters() const {
}
return {};
}
+
+bool RISCVTargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
+ EVT VT = Y.getValueType();
+
+ if (VT.isVector())
+ return false;
+
+ return VT.getSizeInBits() <= Subtarget.getXLen();
+}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 9e3e2a9..dd62a9c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -465,6 +465,8 @@ public:
ArrayRef<MCPhysReg> getRoundingControlRegisters() const override;
+ bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;
+
/// Match a mask which "spreads" the leading elements of a vector evenly
/// across the result. Factor is the spread amount, and Index is the
/// offset applied.
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 636e31c..bf9de0a 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1583,7 +1583,10 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
if (!TII->isAddImmediate(*DeadMI, Reg))
continue;
LIS->RemoveMachineInstrFromMaps(*DeadMI);
+ Register AddReg = DeadMI->getOperand(1).getReg();
DeadMI->eraseFromParent();
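+      // Erasing the ADDI may shrink the live range of its source register;
+      // recompute it so LiveIntervals stays accurate.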
+ if (AddReg.isVirtual())
+ LIS->shrinkToUses(&LIS->getInterval(AddReg));
}
}
}
@@ -1869,11 +1872,15 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
// Loop over the dead AVL values, and delete them now. This has
// to be outside the above loop to avoid invalidating iterators.
for (auto *MI : ToDelete) {
+ assert(MI->getOpcode() == RISCV::ADDI);
+ Register AddReg = MI->getOperand(1).getReg();
if (LIS) {
LIS->removeInterval(MI->getOperand(0).getReg());
LIS->RemoveMachineInstrFromMaps(*MI);
}
MI->eraseFromParent();
+ if (LIS && AddReg.isVirtual())
+ LIS->shrinkToUses(&LIS->getInterval(AddReg));
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index c9df787..b8ab70b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1703,6 +1703,7 @@ unsigned getPredicatedOpcode(unsigned Opcode) {
case RISCV::MAXU: return RISCV::PseudoCCMAXU;
case RISCV::MIN: return RISCV::PseudoCCMIN;
case RISCV::MINU: return RISCV::PseudoCCMINU;
+ case RISCV::MUL: return RISCV::PseudoCCMUL;
case RISCV::ADDI: return RISCV::PseudoCCADDI;
case RISCV::SLLI: return RISCV::PseudoCCSLLI;
@@ -1754,6 +1755,9 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg,
MI->getOpcode() == RISCV::MINU || MI->getOpcode() == RISCV::MAXU))
return nullptr;
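+  // MUL is only predicable when the short-forward-branch-i-mul tuning
+  // feature is enabled.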
+ if (!STI.hasShortForwardBranchIMul() && MI->getOpcode() == RISCV::MUL)
+ return nullptr;
+
// Check if MI can be predicated and folded into the CCMOV.
if (getPredicatedOpcode(MI->getOpcode()) == RISCV::INSTRUCTION_LIST_END)
return nullptr;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
index 5a67a5a..494b1c9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
@@ -110,6 +110,7 @@ def PseudoCCMAX : SFBALU_rr;
def PseudoCCMIN : SFBALU_rr;
def PseudoCCMAXU : SFBALU_rr;
def PseudoCCMINU : SFBALU_rr;
+def PseudoCCMUL : SFBALU_rr;
def PseudoCCADDI : SFBALU_ri;
def PseudoCCANDI : SFBALU_ri;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
index b37ceaae..c2b25c6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
@@ -60,6 +60,8 @@ def immfour : RISCVOp {
let DecoderMethod = "decodeImmFourOperand";
}
+def tuimm2 : TImmLeaf<XLenVT, [{return isUInt<2>(Imm);}]>;
+
//===----------------------------------------------------------------------===//
// Instruction class templates
//===----------------------------------------------------------------------===//
@@ -557,8 +559,8 @@ multiclass VPatTernaryVMAQA_VV_VX<string intrinsic, string instruction,
let Predicates = [HasVendorXTHeadBa] in {
def : Pat<(add_like_non_imm12 (shl GPR:$rs2, uimm2:$uimm2), (XLenVT GPR:$rs1)),
(TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>;
-def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, uimm2:$uimm2, GPR:$rs1)),
- (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>;
+def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, tuimm2:$uimm2, GPR:$rs1)),
+ (TH_ADDSL GPR:$rs1, GPR:$rs2, tuimm2:$uimm2)>;
// Reuse complex patterns from StdExtZba
def : Pat<(add_like_non_imm12 sh1add_op:$rs2, (XLenVT GPR:$rs1)),
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 4537bfe..8376da5 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -53,6 +53,8 @@ def uimm5gt3 : RISCVOp<XLenVT>, ImmLeaf<XLenVT,
let OperandType = "OPERAND_UIMM5_GT3";
}
+def tuimm5gt3 : TImmLeaf<XLenVT, [{return (Imm > 3) && isUInt<5>(Imm);}]>;
+
def UImm5Plus1AsmOperand : AsmOperandClass {
let Name = "UImm5Plus1";
let RenderMethod = "addImmOperands";
@@ -1419,8 +1421,8 @@ def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12_lo:$imm12))),
(QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12_lo:$imm12)>;
def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, (i32 uimm5gt3:$imm)), GPRNoX0:$rs2)),
(QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>;
-def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 uimm5gt3:$imm), GPRNoX0:$rs2)),
- (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>;
+def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 tuimm5gt3:$imm), GPRNoX0:$rs2)),
+ (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, tuimm5gt3:$imm)>;
} // Predicates = [HasVendorXqciac, IsRV32]
/// Simple arithmetic operations
diff --git a/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp b/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp
new file mode 100644
index 0000000..bf1f69f
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp
@@ -0,0 +1,213 @@
+//==- RISCVPromoteConstant.cpp - Promote constant fp to global for RISC-V --==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-promote-const"
+#define RISCV_PROMOTE_CONSTANT_NAME "RISC-V Promote Constants"
+
+STATISTIC(NumPromoted, "Number of constant literals promoted to globals");
+STATISTIC(NumPromotedUses, "Number of uses of promoted literal constants");
+
+namespace {
+
+class RISCVPromoteConstant : public ModulePass {
+public:
+ static char ID;
+ RISCVPromoteConstant() : ModulePass(ID) {}
+
+ StringRef getPassName() const override { return RISCV_PROMOTE_CONSTANT_NAME; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ }
+
+ /// Iterate over the functions and promote the double fp constants that
+ /// would otherwise go into the constant pool to a constant array.
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+    // TargetMachine and Subtarget are needed to query isFPImmLegal.
+ const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+ const TargetMachine &TM = TPC.getTM<TargetMachine>();
+ bool Changed = false;
+ for (Function &F : M) {
+ const RISCVSubtarget &ST = TM.getSubtarget<RISCVSubtarget>(F);
+ const RISCVTargetLowering *TLI = ST.getTargetLowering();
+ Changed |= runOnFunction(F, TLI);
+ }
+ return Changed;
+ }
+
+private:
+ bool runOnFunction(Function &F, const RISCVTargetLowering *TLI);
+};
+} // end anonymous namespace
+
+char RISCVPromoteConstant::ID = 0;
+
+INITIALIZE_PASS(RISCVPromoteConstant, DEBUG_TYPE, RISCV_PROMOTE_CONSTANT_NAME,
+ false, false)
+
+ModulePass *llvm::createRISCVPromoteConstantPass() {
+ return new RISCVPromoteConstant();
+}
+
+bool RISCVPromoteConstant::runOnFunction(Function &F,
+ const RISCVTargetLowering *TLI) {
+ if (F.hasOptNone() || F.hasOptSize())
+ return false;
+
+ // Bail out and make no transformation if the target doesn't support
+ // doubles, or if we're not targeting RV64 as we currently see some
+ // regressions for those targets.
+ if (!TLI->isTypeLegal(MVT::f64) || !TLI->isTypeLegal(MVT::i64))
+ return false;
+
+ // Collect all unique double constants and their uses in the function. Use
+ // MapVector to preserve insertion order.
+ MapVector<ConstantFP *, SmallVector<Use *, 8>> ConstUsesMap;
+
+ for (Instruction &I : instructions(F)) {
+ for (Use &U : I.operands()) {
+ auto *C = dyn_cast<ConstantFP>(U.get());
+ if (!C || !C->getType()->isDoubleTy())
+ continue;
+ // Do not promote if it wouldn't be loaded from the constant pool.
+ if (TLI->isFPImmLegal(C->getValueAPF(), MVT::f64,
+ /*ForCodeSize=*/false))
+ continue;
+ // Do not promote a constant if it is used as an immediate argument
+ // for an intrinsic.
+ if (auto *II = dyn_cast<IntrinsicInst>(U.getUser())) {
+        Function *IntrinsicFunc = II->getCalledFunction();
+ unsigned OperandIdx = U.getOperandNo();
+ if (IntrinsicFunc && IntrinsicFunc->getAttributes().hasParamAttr(
+ OperandIdx, Attribute::ImmArg)) {
+ LLVM_DEBUG(dbgs() << "Skipping promotion of constant in: " << *II
+ << " because operand " << OperandIdx
+ << " must be an immediate.\n");
+ continue;
+ }
+ }
+ // Note: FP args to inline asm would be problematic if we had a
+ // constraint that required an immediate floating point operand. At the
+ // time of writing LLVM doesn't recognise such a constraint.
+ ConstUsesMap[C].push_back(&U);
+ }
+ }
+
+ int PromotableConstants = ConstUsesMap.size();
+ LLVM_DEBUG(dbgs() << "Found " << PromotableConstants
+ << " promotable constants in " << F.getName() << "\n");
+ // Bail out if no promotable constants found, or if only one is found.
+ if (PromotableConstants < 2) {
+ LLVM_DEBUG(dbgs() << "Performing no promotions as insufficient promotable "
+                         "constants were found\n");
+ return false;
+ }
+
+ NumPromoted += PromotableConstants;
+
+ // Create a global array containing the promoted constants.
+ Module *M = F.getParent();
+ Type *DoubleTy = Type::getDoubleTy(M->getContext());
+
+ SmallVector<Constant *, 16> ConstantVector;
+ for (auto const &Pair : ConstUsesMap)
+ ConstantVector.push_back(Pair.first);
+
+ ArrayType *ArrayTy = ArrayType::get(DoubleTy, ConstantVector.size());
+ Constant *GlobalArrayInitializer =
+ ConstantArray::get(ArrayTy, ConstantVector);
+
+ auto *GlobalArray = new GlobalVariable(
+ *M, ArrayTy,
+ /*isConstant=*/true, GlobalValue::InternalLinkage, GlobalArrayInitializer,
+ ".promoted_doubles." + F.getName());
+
+ // A cache to hold the loaded value for a given constant within a basic block.
+ DenseMap<std::pair<ConstantFP *, BasicBlock *>, Value *> LocalLoads;
+
+ // Replace all uses with the loaded value.
+ unsigned Idx = 0;
+ for (auto const &Pair : ConstUsesMap) {
+ ConstantFP *Const = Pair.first;
+ const SmallVector<Use *, 8> &Uses = Pair.second;
+
+ for (Use *U : Uses) {
+ Instruction *UserInst = cast<Instruction>(U->getUser());
+ BasicBlock *InsertionBB;
+
+ // If the user is a PHI node, we must insert the load in the
+ // corresponding predecessor basic block. Otherwise, it's inserted into
+ // the same block as the use.
+ if (auto *PN = dyn_cast<PHINode>(UserInst))
+ InsertionBB = PN->getIncomingBlock(*U);
+ else
+ InsertionBB = UserInst->getParent();
+
+ if (isa<CatchSwitchInst>(InsertionBB->getTerminator())) {
+        LLVM_DEBUG(dbgs() << "Bailing out: catchswitch means there is no valid "
+ "insertion point.\n");
+ return false;
+ }
+
+ auto CacheKey = std::make_pair(Const, InsertionBB);
+ Value *LoadedVal = nullptr;
+
+ // Re-use a load if it exists in the insertion block.
+ if (LocalLoads.count(CacheKey)) {
+ LoadedVal = LocalLoads.at(CacheKey);
+ } else {
+ // Otherwise, create a new GEP and Load at the correct insertion point.
+        // It is always safe to insert at the first insertion point in the BB,
+ // so do that and let other passes reorder.
+ IRBuilder<> Builder(InsertionBB, InsertionBB->getFirstInsertionPt());
+ Value *ElementPtr = Builder.CreateConstInBoundsGEP2_64(
+ GlobalArray->getValueType(), GlobalArray, 0, Idx, "double.addr");
+ LoadedVal = Builder.CreateLoad(DoubleTy, ElementPtr, "double.val");
+
+ // Cache the newly created load for this block.
+ LocalLoads[CacheKey] = LoadedVal;
+ }
+
+ U->set(LoadedVal);
+ ++NumPromotedUses;
+ }
+ ++Idx;
+ }
+
+ return true;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index ae54ff1..16ef67d 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -139,6 +139,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
initializeRISCVExpandAtomicPseudoPass(*PR);
initializeRISCVRedundantCopyEliminationPass(*PR);
initializeRISCVAsmPrinterPass(*PR);
+ initializeRISCVPromoteConstantPass(*PR);
}
static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
@@ -462,6 +463,8 @@ void RISCVPassConfig::addIRPasses() {
}
bool RISCVPassConfig::addPreISel() {
+ if (TM->getOptLevel() != CodeGenOptLevel::None)
+ addPass(createRISCVPromoteConstantPass());
if (TM->getOptLevel() != CodeGenOptLevel::None) {
// Add a barrier before instruction selection so that we will not get
// deleted block address after enabling default outlining. See D99707 for
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 640b014..0175f2f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -577,6 +577,11 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) {
if (MDNode *Node = F.getMetadata("intel_reqd_sub_group_size"))
outputExecutionModeFromMDNode(FReg, Node,
SPIRV::ExecutionMode::SubgroupSize, 0, 0);
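+  // "max_work_group_size" maps to the MaxWorkgroupSizeINTEL execution mode,
+  // which requires the SPV_INTEL_kernel_attributes extension.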
+ if (MDNode *Node = F.getMetadata("max_work_group_size")) {
+ if (ST->canUseExtension(SPIRV::Extension::SPV_INTEL_kernel_attributes))
+ outputExecutionModeFromMDNode(
+ FReg, Node, SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, 3, 1);
+ }
if (MDNode *Node = F.getMetadata("vec_type_hint")) {
MCInst Inst;
Inst.setOpcode(SPIRV::OpExecutionMode);
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 56a38bb..b2cbdb2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -2390,6 +2390,15 @@ static bool generateBindlessImageINTELInst(const SPIRV::IncomingCall *Call,
return buildBindlessImageINTELInst(Call, Opcode, MIRBuilder, GR);
}
+static bool generateBlockingPipesInst(const SPIRV::IncomingCall *Call,
+ MachineIRBuilder &MIRBuilder,
+ SPIRVGlobalRegistry *GR) {
+ const SPIRV::DemangledBuiltin *Builtin = Call->Builtin;
+ unsigned Opcode =
+ SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode;
+ return buildOpFromWrapper(MIRBuilder, Opcode, Call, Register(0));
+}
+
static bool
generateTernaryBitwiseFunctionINTELInst(const SPIRV::IncomingCall *Call,
MachineIRBuilder &MIRBuilder,
@@ -3050,6 +3059,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall,
return generatePipeInst(Call.get(), MIRBuilder, GR);
case SPIRV::PredicatedLoadStore:
return generatePredicatedLoadStoreInst(Call.get(), MIRBuilder, GR);
+ case SPIRV::BlockingPipes:
+ return generateBlockingPipesInst(Call.get(), MIRBuilder, GR);
}
return false;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index c259cce..492a98e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -71,6 +71,7 @@ def TernaryBitwiseINTEL : BuiltinGroup;
def Block2DLoadStore : BuiltinGroup;
def Pipe : BuiltinGroup;
def PredicatedLoadStore : BuiltinGroup;
+def BlockingPipes : BuiltinGroup;
//===----------------------------------------------------------------------===//
// Class defining a demangled builtin record. The information in the record
@@ -1174,6 +1175,10 @@ defm : DemangledNativeBuiltin<"clock_read_sub_group", OpenCL_std, KernelClock, 0
defm : DemangledNativeBuiltin<"clock_read_hilo_device", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
defm : DemangledNativeBuiltin<"clock_read_hilo_work_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
defm : DemangledNativeBuiltin<"clock_read_hilo_sub_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+
+// SPV_ALTERA_blocking_pipes
+defm : DemangledNativeBuiltin<"__spirv_WritePipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpWritePipeBlockingALTERA>;
+defm : DemangledNativeBuiltin<"__spirv_ReadPipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpReadPipeBlockingALTERA>;
defm : DemangledNativeBuiltin<"__spirv_ReadClockKHR", OpenCL_std, KernelClock, 1, 1, OpReadClockKHR>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index 96f5dee..f681b0d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -107,6 +107,8 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>>
SPIRV::Extension::Extension::SPV_INTEL_inline_assembly},
{"SPV_INTEL_bindless_images",
SPIRV::Extension::Extension::SPV_INTEL_bindless_images},
+ {"SPV_INTEL_bfloat16_arithmetic",
+ SPIRV::Extension::Extension::SPV_INTEL_bfloat16_arithmetic},
{"SPV_INTEL_bfloat16_conversion",
SPIRV::Extension::Extension::SPV_INTEL_bfloat16_conversion},
{"SPV_KHR_subgroup_rotate",
@@ -155,7 +157,11 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>>
{"SPV_INTEL_predicated_io",
SPIRV::Extension::Extension::SPV_INTEL_predicated_io},
{"SPV_KHR_maximal_reconvergence",
- SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}};
+ SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence},
+ {"SPV_INTEL_kernel_attributes",
+ SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes},
+ {"SPV_ALTERA_blocking_pipes",
+ SPIRV::Extension::Extension::SPV_ALTERA_blocking_pipes}};
bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName,
StringRef ArgValue,
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index a61351e..03bd61b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -993,3 +993,9 @@ def OpPredicatedLoadINTEL: Op<6528, (outs ID:$res), (ins TYPE:$resType, ID:$ptr,
"$res = OpPredicatedLoadINTEL $resType $ptr $predicate $default_value">;
def OpPredicatedStoreINTEL: Op<6529, (outs), (ins ID:$ptr, ID:$object, ID:$predicate, variable_ops),
"OpPredicatedStoreINTEL $ptr $object $predicate">;
+
+// SPV_ALTERA_blocking_pipes
+def OpReadPipeBlockingALTERA: Op<5946, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment),
+ "OpReadPipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">;
+def OpWritePipeBlockingALTERA: Op<5947, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment),
+ "OpWritePipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 3f0424f..245e5a2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3516,6 +3516,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
case Intrinsic::spv_resource_nonuniformindex: {
return selectResourceNonUniformIndex(ResVReg, ResType, I);
}
+ case Intrinsic::spv_unpackhalf2x16: {
+ return selectExtInst(ResVReg, ResType, I, GL::UnpackHalf2x16);
+ }
+
default: {
std::string DiagMsg;
raw_string_ostream OS(DiagMsg);
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index db036a5..af76016 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1435,6 +1435,8 @@ void addInstrRequirements(const MachineInstr &MI,
addPrintfRequirements(MI, Reqs, ST);
break;
}
+ // TODO: handle bfloat16 extended instructions when
+ // SPV_INTEL_bfloat16_arithmetic is enabled.
break;
}
case SPIRV::OpAliasDomainDeclINTEL:
@@ -1883,6 +1885,13 @@ void addInstrRequirements(const MachineInstr &MI,
Reqs.addCapability(
SPIRV::Capability::CooperativeMatrixCheckedInstructionsINTEL);
break;
+ case SPIRV::OpReadPipeBlockingALTERA:
+ case SPIRV::OpWritePipeBlockingALTERA:
+ if (ST.canUseExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes)) {
+ Reqs.addExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes);
+ Reqs.addCapability(SPIRV::Capability::BlockingPipesALTERA);
+ }
+ break;
case SPIRV::OpCooperativeMatrixGetElementCoordINTEL:
if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_joint_matrix))
report_fatal_error("OpCooperativeMatrixGetElementCoordINTEL requires the "
@@ -2060,7 +2069,64 @@ void addInstrRequirements(const MachineInstr &MI,
Reqs.addCapability(SPIRV::Capability::PredicatedIOINTEL);
break;
}
-
+ case SPIRV::OpFAddS:
+ case SPIRV::OpFSubS:
+ case SPIRV::OpFMulS:
+ case SPIRV::OpFDivS:
+ case SPIRV::OpFRemS:
+ case SPIRV::OpFMod:
+ case SPIRV::OpFNegate:
+ case SPIRV::OpFAddV:
+ case SPIRV::OpFSubV:
+ case SPIRV::OpFMulV:
+ case SPIRV::OpFDivV:
+ case SPIRV::OpFRemV:
+ case SPIRV::OpFNegateV: {
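+    // The result type (operand 1) tells us whether this is a bfloat16
+    // operation; look through vectors to the element type.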
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ SPIRVType *TypeDef = MRI.getVRegDef(MI.getOperand(1).getReg());
+ if (TypeDef->getOpcode() == SPIRV::OpTypeVector)
+ TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg());
+ if (isBFloat16Type(TypeDef)) {
+ if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic))
+ report_fatal_error(
+ "Arithmetic instructions with bfloat16 arguments require the "
+ "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic",
+ false);
+ Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic);
+ Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL);
+ }
+ break;
+ }
+ case SPIRV::OpOrdered:
+ case SPIRV::OpUnordered:
+ case SPIRV::OpFOrdEqual:
+ case SPIRV::OpFOrdNotEqual:
+ case SPIRV::OpFOrdLessThan:
+ case SPIRV::OpFOrdLessThanEqual:
+ case SPIRV::OpFOrdGreaterThan:
+ case SPIRV::OpFOrdGreaterThanEqual:
+ case SPIRV::OpFUnordEqual:
+ case SPIRV::OpFUnordNotEqual:
+ case SPIRV::OpFUnordLessThan:
+ case SPIRV::OpFUnordLessThanEqual:
+ case SPIRV::OpFUnordGreaterThan:
+ case SPIRV::OpFUnordGreaterThanEqual: {
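+    // Relational ops produce a bool, so inspect the type of the first value
+    // operand (operand 2) instead of the result type.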
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ MachineInstr *OperandDef = MRI.getVRegDef(MI.getOperand(2).getReg());
+ SPIRVType *TypeDef = MRI.getVRegDef(OperandDef->getOperand(1).getReg());
+ if (TypeDef->getOpcode() == SPIRV::OpTypeVector)
+ TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg());
+ if (isBFloat16Type(TypeDef)) {
+ if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic))
+ report_fatal_error(
+ "Relational instructions with bfloat16 arguments require the "
+ "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic",
+ false);
+ Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic);
+ Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL);
+ }
+ break;
+ }
default:
break;
}
@@ -2180,6 +2246,10 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI,
MAI.Reqs.getAndAddRequirements(
SPIRV::OperandCategory::ExecutionModeOperand,
SPIRV::ExecutionMode::SubgroupSize, ST);
+ if (F.getMetadata("max_work_group_size"))
+ MAI.Reqs.getAndAddRequirements(
+ SPIRV::OperandCategory::ExecutionModeOperand,
+ SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, ST);
if (F.getMetadata("vec_type_hint"))
MAI.Reqs.getAndAddRequirements(
SPIRV::OperandCategory::ExecutionModeOperand,
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index 4e4e6fb..be88f33 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -56,6 +56,13 @@ public:
}
};
+static cl::list<std::string> SPVAllowUnknownIntrinsics(
+ "spv-allow-unknown-intrinsics", cl::CommaSeparated,
+ cl::desc("Emit unknown intrinsics as calls to external functions. A "
+ "comma-separated input list of intrinsic prefixes must be "
+ "provided, and only intrinsics carrying a listed prefix get "
+ "emitted as described."),
+ cl::value_desc("intrinsic_prefix_0,intrinsic_prefix_1"), cl::ValueOptional);
} // namespace
char SPIRVPrepareFunctions::ID = 0;
@@ -445,6 +452,15 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
EraseFromParent);
Changed = true;
break;
+ default:
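+      // When targeting an AMD vendor triple, or when the intrinsic name
+      // matches a user-supplied prefix, lower it to a plain external call.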
+ if (TM.getTargetTriple().getVendor() == Triple::AMD ||
+ any_of(SPVAllowUnknownIntrinsics, [II](auto &&Prefix) {
+ if (Prefix.empty())
+ return false;
+ return II->getCalledFunction()->getName().starts_with(Prefix);
+ }))
+ Changed |= lowerIntrinsicToFunction(II);
+ break;
}
}
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
index ba09692..ad6c9cd 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
@@ -70,7 +70,6 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU,
SPIRVVersion = VersionTuple(1, 3);
break;
case Triple::SPIRVSubArch_v14:
- default:
SPIRVVersion = VersionTuple(1, 4);
break;
case Triple::SPIRVSubArch_v15:
@@ -79,13 +78,19 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU,
case Triple::SPIRVSubArch_v16:
SPIRVVersion = VersionTuple(1, 6);
break;
+ default:
+ if (TT.getVendor() == Triple::AMD)
+ SPIRVVersion = VersionTuple(1, 6);
+ else
+ SPIRVVersion = VersionTuple(1, 4);
}
OpenCLVersion = VersionTuple(2, 2);
// Set the environment based on the target triple.
if (TargetTriple.getOS() == Triple::Vulkan)
Env = Shader;
- else if (TargetTriple.getEnvironment() == Triple::OpenCL)
+ else if (TargetTriple.getEnvironment() == Triple::OpenCL ||
+ TargetTriple.getVendor() == Triple::AMD)
Env = Kernel;
else
Env = Unknown;
@@ -93,6 +98,8 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU,
// Set the default extensions based on the target triple.
if (TargetTriple.getVendor() == Triple::Intel)
Extensions.insert(SPIRV::Extension::SPV_INTEL_function_pointers);
+ if (TargetTriple.getVendor() == Triple::AMD)
+ Extensions = SPIRVExtensionsParser::getValidExtensions(TargetTriple);
// The order of initialization is important.
initAvailableExtensions(Extensions);
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 7d08b29..65a8885 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -309,7 +309,7 @@ defm SPV_KHR_shader_clock : ExtensionOperand<54, [EnvVulkan, EnvOpenCL]>;
defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55, [EnvOpenCL]>;
defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56, [EnvVulkan]>;
defm SPV_INTEL_fpga_reg : ExtensionOperand<57, [EnvOpenCL]>;
-defm SPV_INTEL_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>;
+defm SPV_ALTERA_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>;
defm SPV_GOOGLE_user_type : ExtensionOperand<59, [EnvVulkan]>;
defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60, [EnvVulkan]>;
defm SPV_INTEL_kernel_attributes : ExtensionOperand<61, [EnvOpenCL]>;
@@ -387,6 +387,8 @@ defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>;
defm SPV_KHR_bfloat16 : ExtensionOperand<126, [EnvVulkan, EnvOpenCL]>;
defm SPV_INTEL_predicated_io : ExtensionOperand<127, [EnvOpenCL]>;
defm SPV_KHR_maximal_reconvergence : ExtensionOperand<128, [EnvVulkan]>;
+defm SPV_INTEL_bfloat16_arithmetic
+ : ExtensionOperand<129, [EnvVulkan, EnvOpenCL]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define Capabilities enum values and at the same time
@@ -570,6 +572,7 @@ defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atom
defm VariableLengthArrayINTEL : CapabilityOperand<5817, 0, 0, [SPV_INTEL_variable_length_array], []>;
defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>;
defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>;
+defm BFloat16ArithmeticINTEL : CapabilityOperand<6226, 0, 0, [SPV_INTEL_bfloat16_arithmetic], []>;
defm BFloat16ConversionINTEL : CapabilityOperand<6115, 0, 0, [SPV_INTEL_bfloat16_conversion], []>;
defm GlobalVariableHostAccessINTEL : CapabilityOperand<6187, 0, 0, [SPV_INTEL_global_variable_host_access], []>;
defm HostAccessINTEL : CapabilityOperand<6188, 0, 0, [SPV_INTEL_global_variable_host_access], []>;
@@ -587,6 +590,11 @@ defm CooperativeMatrixBFloat16ComponentTypeINTEL : CapabilityOperand<6437, 0, 0,
defm RoundToInfinityINTEL : CapabilityOperand<5582, 0, 0, [SPV_INTEL_float_controls2], []>;
defm FloatingPointModeINTEL : CapabilityOperand<5583, 0, 0, [SPV_INTEL_float_controls2], []>;
defm FunctionFloatControlINTEL : CapabilityOperand<5821, 0, 0, [SPV_INTEL_float_controls2], []>;
+defm KernelAttributesINTEL : CapabilityOperand<5892, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>;
+// TODO-SPIRV: add these once they are used / tested.
+// defm FPGAKernelAttributesINTEL : CapabilityOperand<5897, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>;
+// defm FPGAKernelAttributesv2INTEL : CapabilityOperand<6161, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>;
+// END TODO-SPIRV
defm LongCompositesINTEL : CapabilityOperand<6089, 0, 0, [SPV_INTEL_long_composites], []>;
defm BindlessImagesINTEL : CapabilityOperand<6528, 0, 0, [SPV_INTEL_bindless_images], []>;
defm MemoryAccessAliasingINTEL : CapabilityOperand<5910, 0, 0, [SPV_INTEL_memory_access_aliasing], []>;
@@ -603,6 +611,7 @@ defm TensorFloat32RoundingINTEL : CapabilityOperand<6425, 0, 0, [SPV_INTEL_tenso
defm BFloat16TypeKHR : CapabilityOperand<5116, 0, 0, [SPV_KHR_bfloat16], []>;
defm BFloat16DotProductKHR : CapabilityOperand<5117, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR]>;
defm BFloat16CooperativeMatrixKHR : CapabilityOperand<5118, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR, CooperativeMatrixKHR]>;
+defm BlockingPipesALTERA : CapabilityOperand<5945, 0, 0, [SPV_ALTERA_blocking_pipes], []>;
//===----------------------------------------------------------------------===//
// Multiclass used to define SourceLanguage enum values and at the same time
@@ -805,6 +814,15 @@ defm RoundingModeRTPINTEL : ExecutionModeOperand<5620, [RoundToInfinityINTEL]>;
defm RoundingModeRTNINTEL : ExecutionModeOperand<5621, [RoundToInfinityINTEL]>;
defm FloatingPointModeALTINTEL : ExecutionModeOperand<5622, [FloatingPointModeINTEL]>;
defm FloatingPointModeIEEEINTEL : ExecutionModeOperand<5623, [FloatingPointModeINTEL]>;
+defm MaxWorkgroupSizeINTEL : ExecutionModeOperand<5893, [KernelAttributesINTEL]>;
+// TODO-SPIRV: Add the following once they are used / tested.
+// defm MaxWorkDimINTEL : ExecutionModeOperand<5894, [KernelAttributesINTEL]>;
+// defm NoGlobalOffsetINTEL : ExecutionModeOperand<5895, [KernelAttributesINTEL]>;
+// defm NumSIMDWorkitemsINTEL : ExecutionModeOperand<5896, [FPGAKernelAttributesINTEL]>;
+// defm SchedulerTargetFmaxMhzINTEL : ExecutionModeOperand<5903, [FPGAKernelAttributesINTEL]>;
+// defm StreamingInterfaceINTEL : ExecutionModeOperand<6154, [FPGAKernelAttributesv2INTEL]>;
+// defm RegisterMapInterfaceINTEL : ExecutionModeOperand<6160, [FPGAKernelAttributesv2INTEL]>;
+// END TODO-SPIRV
defm FPFastMathDefault : ExecutionModeOperand<6028, [FloatControls2]>;
defm MaximallyReconvergesKHR : ExecutionModeOperand<6023, [Shader]>;
@@ -1919,7 +1937,7 @@ defm GenericCastToPtr : SpecConstantOpOperandsOperand<122, [], [Kernel]>;
defm PtrCastToGeneric : SpecConstantOpOperandsOperand<121, [], [Kernel]>;
defm Bitcast : SpecConstantOpOperandsOperand<124, [], []>;
defm QuantizeToF16 : SpecConstantOpOperandsOperand<116, [], [Shader]>;
-// Arithmetic
+// Arithmetic
defm SNegate : SpecConstantOpOperandsOperand<126, [], []>;
defm Not : SpecConstantOpOperandsOperand<200, [], []>;
defm IAdd : SpecConstantOpOperandsOperand<128, [], []>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
index 5ba0356..2951a4b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
@@ -244,7 +244,8 @@ static cl::opt<bool> SPVEnableNonSemanticDI(
cl::Optional, cl::init(false));
void SPIRVPassConfig::addPreEmitPass() {
- if (SPVEnableNonSemanticDI) {
+ if (SPVEnableNonSemanticDI ||
+ getSPIRVTargetMachine().getTargetTriple().getVendor() == Triple::AMD) {
addPass(createSPIRVEmitNonSemanticDIPass(&getTM<SPIRVTargetMachine>()));
}
}
diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td
index 7137e5f..38b0508 100644
--- a/llvm/lib/Target/Sparc/Sparc.td
+++ b/llvm/lib/Target/Sparc/Sparc.td
@@ -95,6 +95,9 @@ def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
def TuneSlowRDPC : SubtargetFeature<"slow-rdpc", "HasSlowRDPC", "true",
"rd %pc, %XX is slow", [FeatureV9]>;
+def TuneNoPredictor : SubtargetFeature<"no-predictor", "HasNoPredictor", "true",
+ "Processor has no branch predictor, branches stall execution", []>;
+
//==== Features added predominantly for LEON subtarget support
include "LeonFeatures.td"
@@ -174,12 +177,15 @@ def : Proc<"ultrasparc3", [FeatureV9, FeatureV8Deprecated, FeatureVIS,
FeatureVIS2],
[TuneSlowRDPC]>;
def : Proc<"niagara", [FeatureV9, FeatureV8Deprecated, FeatureVIS,
- FeatureVIS2, FeatureUA2005]>;
+ FeatureVIS2, FeatureUA2005],
+ [TuneNoPredictor]>;
def : Proc<"niagara2", [FeatureV9, FeatureV8Deprecated, UsePopc,
- FeatureVIS, FeatureVIS2, FeatureUA2005]>;
+ FeatureVIS, FeatureVIS2, FeatureUA2005],
+ [TuneNoPredictor]>;
def : Proc<"niagara3", [FeatureV9, FeatureV8Deprecated, UsePopc,
FeatureVIS, FeatureVIS2, FeatureVIS3,
- FeatureUA2005, FeatureUA2007]>;
+ FeatureUA2005, FeatureUA2007],
+ [TuneNoPredictor]>;
def : Proc<"niagara4", [FeatureV9, FeatureV8Deprecated, UsePopc,
FeatureVIS, FeatureVIS2, FeatureVIS3,
FeatureUA2005, FeatureUA2007, FeatureOSA2011,
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index cbb7db6..ae3c326 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -2000,6 +2000,14 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ // Some processors have no branch predictor and have pipelines longer than
+ // what can be covered by the delay slot. This results in a stall, so mark
+ // branches to be expensive on those processors.
+ setJumpIsExpensive(Subtarget->hasNoPredictor());
+ // The high cost of branching means that using conditional moves will
+ // still be profitable even if the condition is predictable.
+ PredictableSelectIsExpensive = !isJumpExpensive();
+
setMinFunctionAlignment(Align(4));
computeRegisterProperties(Subtarget->getRegisterInfo());
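A minimal illustration of the kind of source pattern affected (example code, not from the patch): with jumps marked expensive and predictable selects not expensive, the select below is more likely to be lowered to a conditional move than to a compare-and-branch when targeting -mcpu=niagara or another TuneNoPredictor processor.

  // Likely lowered with a conditional move rather than a branch on
  // predictor-less CPUs, even though the condition may be predictable.
  int clampToZero(int X) { return X < 0 ? 0 : X; }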
diff --git a/llvm/lib/Target/Target.cpp b/llvm/lib/Target/Target.cpp
index ec673ef..7387571 100644
--- a/llvm/lib/Target/Target.cpp
+++ b/llvm/lib/Target/Target.cpp
@@ -37,6 +37,7 @@ inline LLVMTargetLibraryInfoRef wrap(const TargetLibraryInfoImpl *P) {
void llvm::initializeTarget(PassRegistry &Registry) {
initializeTargetLibraryInfoWrapperPassPass(Registry);
+ initializeRuntimeLibraryInfoWrapperPass(Registry);
initializeTargetTransformInfoWrapperPassPass(Registry);
}
diff --git a/llvm/lib/Target/WebAssembly/CMakeLists.txt b/llvm/lib/Target/WebAssembly/CMakeLists.txt
index 1e83cbe..17df119 100644
--- a/llvm/lib/Target/WebAssembly/CMakeLists.txt
+++ b/llvm/lib/Target/WebAssembly/CMakeLists.txt
@@ -10,6 +10,7 @@ tablegen(LLVM WebAssemblyGenFastISel.inc -gen-fast-isel)
tablegen(LLVM WebAssemblyGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM WebAssemblyGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM WebAssemblyGenSDNodeInfo.inc -gen-sd-node-info)
tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget)
add_public_tablegen_target(WebAssemblyCommonTableGen)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 2666342..66ed8b0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -46,7 +46,7 @@ class WebAssemblyFastISel final : public FastISel {
// All possible address modes.
class Address {
public:
- using BaseKind = enum { RegBase, FrameIndexBase };
+ enum BaseKind { RegBase, FrameIndexBase };
private:
BaseKind Kind = RegBase;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
index 37a3457..9fef3e6 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -24,6 +24,7 @@
#include "WebAssembly.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
@@ -114,6 +115,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) {
Wrapper->setAttributes(F->getAttributes());
BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper);
const DataLayout &DL = BB->getDataLayout();
+ IRBuilder<> Builder(BB);
// Determine what arguments to pass.
SmallVector<Value *, 4> Args;
@@ -140,10 +142,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) {
Args.push_back(&*AI);
} else {
if (CastInst::isBitOrNoopPointerCastable(ArgType, ParamType, DL)) {
- Instruction *PtrCast =
- CastInst::CreateBitOrPointerCast(AI, ParamType, "cast");
- PtrCast->insertInto(BB, BB->end());
- Args.push_back(PtrCast);
+ Args.push_back(Builder.CreateBitOrPointerCast(AI, ParamType, "cast"));
} else if (ArgType->isStructTy() || ParamType->isStructTy()) {
LLVM_DEBUG(dbgs() << "createWrapper: struct param type in bitcast: "
<< F->getName() << "\n");
@@ -166,24 +165,19 @@ static Function *createWrapper(Function *F, FunctionType *Ty) {
for (; AI != AE; ++AI)
Args.push_back(&*AI);
- CallInst *Call = CallInst::Create(F, Args, "", BB);
+ CallInst *Call = Builder.CreateCall(F, Args);
- Type *ExpectedRtnType = F->getFunctionType()->getReturnType();
- Type *RtnType = Ty->getReturnType();
// Determine what value to return.
if (RtnType->isVoidTy()) {
- ReturnInst::Create(M->getContext(), BB);
+ Builder.CreateRetVoid();
} else if (ExpectedRtnType->isVoidTy()) {
LLVM_DEBUG(dbgs() << "Creating dummy return: " << *RtnType << "\n");
- ReturnInst::Create(M->getContext(), PoisonValue::get(RtnType), BB);
+ Builder.CreateRet(PoisonValue::get(RtnType));
} else if (RtnType == ExpectedRtnType) {
- ReturnInst::Create(M->getContext(), Call, BB);
+ Builder.CreateRet(Call);
} else if (CastInst::isBitOrNoopPointerCastable(ExpectedRtnType, RtnType,
DL)) {
- Instruction *Cast =
- CastInst::CreateBitOrPointerCast(Call, RtnType, "cast");
- Cast->insertInto(BB, BB->end());
- ReturnInst::Create(M->getContext(), Cast, BB);
+ Builder.CreateRet(Builder.CreateBitOrPointerCast(Call, RtnType, "cast"));
} else if (RtnType->isStructTy() || ExpectedRtnType->isStructTy()) {
LLVM_DEBUG(dbgs() << "createWrapper: struct return type in bitcast: "
<< F->getName() << "\n");
@@ -203,9 +197,8 @@ static Function *createWrapper(Function *F, FunctionType *Ty) {
Wrapper = Function::Create(Ty, Function::PrivateLinkage,
F->getName() + "_bitcast_invalid", M);
Wrapper->setAttributes(F->getAttributes());
- BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper);
- new UnreachableInst(M->getContext(), BB);
- Wrapper->setName(F->getName() + "_bitcast_invalid");
+ IRBuilder<> Builder(BasicBlock::Create(M->getContext(), "body", Wrapper));
+ Builder.CreateUnreachable();
} else if (!WrapperNeeded) {
LLVM_DEBUG(dbgs() << "createWrapper: no wrapper needed: " << F->getName()
<< "\n");
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
deleted file mode 100644
index 23108e4..0000000
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
+++ /dev/null
@@ -1,64 +0,0 @@
-//- WebAssemblyISD.def - WebAssembly ISD ---------------------------*- C++ -*-//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file describes the various WebAssembly ISD node types.
-///
-//===----------------------------------------------------------------------===//
-
-// NOTE: NO INCLUDE GUARD DESIRED!
-
-HANDLE_NODETYPE(CALL)
-HANDLE_NODETYPE(RET_CALL)
-HANDLE_NODETYPE(RETURN)
-HANDLE_NODETYPE(ARGUMENT)
-HANDLE_NODETYPE(LOCAL_GET)
-HANDLE_NODETYPE(LOCAL_SET)
-// A wrapper node for TargetExternalSymbol, TargetGlobalAddress, and MCSymbol
-HANDLE_NODETYPE(Wrapper)
-// A special node for TargetGlobalAddress used in PIC code for
-// __memory_base/__table_base relative access.
-HANDLE_NODETYPE(WrapperREL)
-HANDLE_NODETYPE(BR_IF)
-HANDLE_NODETYPE(BR_TABLE)
-HANDLE_NODETYPE(DOT)
-HANDLE_NODETYPE(EXT_ADD_PAIRWISE_U)
-HANDLE_NODETYPE(EXT_ADD_PAIRWISE_S)
-HANDLE_NODETYPE(SHUFFLE)
-HANDLE_NODETYPE(SWIZZLE)
-HANDLE_NODETYPE(VEC_SHL)
-HANDLE_NODETYPE(VEC_SHR_S)
-HANDLE_NODETYPE(VEC_SHR_U)
-HANDLE_NODETYPE(NARROW_U)
-HANDLE_NODETYPE(EXTEND_LOW_S)
-HANDLE_NODETYPE(EXTEND_LOW_U)
-HANDLE_NODETYPE(EXTEND_HIGH_S)
-HANDLE_NODETYPE(EXTEND_HIGH_U)
-HANDLE_NODETYPE(CONVERT_LOW_S)
-HANDLE_NODETYPE(CONVERT_LOW_U)
-HANDLE_NODETYPE(PROMOTE_LOW)
-HANDLE_NODETYPE(TRUNC_SAT_ZERO_S)
-HANDLE_NODETYPE(TRUNC_SAT_ZERO_U)
-HANDLE_NODETYPE(DEMOTE_ZERO)
-HANDLE_NODETYPE(I64_ADD128)
-HANDLE_NODETYPE(I64_SUB128)
-HANDLE_NODETYPE(I64_MUL_WIDE_S)
-HANDLE_NODETYPE(I64_MUL_WIDE_U)
-
-// Memory intrinsics
-HANDLE_NODETYPE(GLOBAL_GET)
-HANDLE_NODETYPE(GLOBAL_SET)
-HANDLE_NODETYPE(TABLE_GET)
-HANDLE_NODETYPE(TABLE_SET)
-
-// Bulk memory instructions. These follow LLVM's expected semantics of
-// supporting out-of-bounds pointers if the length is zero, by inserting
-// a branch around Wasm's `memory.copy` and `memory.fill`, which would
-// otherwise trap.
-HANDLE_NODETYPE(MEMCPY)
-HANDLE_NODETYPE(MEMSET)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 7ec463b..fc6c290 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -216,7 +216,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Combine fp_to_{s,u}int_sat or fp_round of concat_vectors or vice versa
// into conversion ops
setTargetDAGCombine({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
- ISD::FP_ROUND, ISD::CONCAT_VECTORS});
+ ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_ROUND,
+ ISD::CONCAT_VECTORS});
setTargetDAGCombine(ISD::TRUNCATE);
@@ -942,20 +943,6 @@ MachineBasicBlock *WebAssemblyTargetLowering::EmitInstrWithCustomInserter(
}
}
-const char *
-WebAssemblyTargetLowering::getTargetNodeName(unsigned Opcode) const {
- switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) {
- case WebAssemblyISD::FIRST_NUMBER:
- break;
-#define HANDLE_NODETYPE(NODE) \
- case WebAssemblyISD::NODE: \
- return "WebAssemblyISD::" #NODE;
-#include "WebAssemblyISD.def"
-#undef HANDLE_NODETYPE
- }
- return nullptr;
-}
-
std::pair<unsigned, const TargetRegisterClass *>
WebAssemblyTargetLowering::getRegForInlineAsmConstraint(
const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
@@ -1830,11 +1817,8 @@ SDValue WebAssemblyTargetLowering::LowerLoad(SDValue Op,
SDValue Idx = DAG.getTargetConstant(*Local, Base, MVT::i32);
EVT LocalVT = LN->getValueType(0);
- SDValue LocalGet = DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, LocalVT,
- {LN->getChain(), Idx});
- SDValue Result = DAG.getMergeValues({LocalGet, LN->getChain()}, DL);
- assert(Result->getNumValues() == 2 && "Loads must carry a chain!");
- return Result;
+ return DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, {LocalVT, MVT::Other},
+ {LN->getChain(), Idx});
}
if (WebAssembly::isWasmVarAddressSpace(LN->getAddressSpace()))
@@ -3597,6 +3581,64 @@ static SDValue performMulCombine(SDNode *N,
}
}
+SDValue DoubleVectorWidth(SDValue In, unsigned RequiredNumElems,
+ SelectionDAG &DAG) {
+ SDLoc DL(In);
+ LLVMContext &Ctx = *DAG.getContext();
+ EVT InVT = In.getValueType();
+ unsigned NumElems = InVT.getVectorNumElements() * 2;
+ EVT OutVT = EVT::getVectorVT(Ctx, InVT.getVectorElementType(), NumElems);
+ SDValue Concat =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, In, DAG.getPOISON(InVT));
+ if (NumElems < RequiredNumElems) {
+ return DoubleVectorWidth(Concat, RequiredNumElems, DAG);
+ }
+ return Concat;
+}
+
+SDValue performConvertFPCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT OutVT = N->getValueType(0);
+ if (!OutVT.isVector())
+ return SDValue();
+
+ EVT OutElTy = OutVT.getVectorElementType();
+ if (OutElTy != MVT::i8 && OutElTy != MVT::i16)
+ return SDValue();
+
+ unsigned NumElems = OutVT.getVectorNumElements();
+ if (!isPowerOf2_32(NumElems))
+ return SDValue();
+
+ EVT FPVT = N->getOperand(0)->getValueType(0);
+ if (FPVT.getVectorElementType() != MVT::f32)
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // First, convert to i32.
+ LLVMContext &Ctx = *DAG.getContext();
+ EVT IntVT = EVT::getVectorVT(Ctx, MVT::i32, NumElems);
+ SDValue ToInt = DAG.getNode(N->getOpcode(), DL, IntVT, N->getOperand(0));
+ APInt Mask = APInt::getLowBitsSet(IntVT.getScalarSizeInBits(),
+ OutVT.getScalarSizeInBits());
+ // Mask out the top MSBs.
+ SDValue Masked =
+ DAG.getNode(ISD::AND, DL, IntVT, ToInt, DAG.getConstant(Mask, DL, IntVT));
+
+ if (OutVT.getSizeInBits() < 128) {
+ // Create a wide enough vector that we can use narrow.
+ EVT NarrowedVT = OutElTy == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
+ unsigned NumRequiredElems = NarrowedVT.getVectorNumElements();
+ SDValue WideVector = DoubleVectorWidth(Masked, NumRequiredElems, DAG);
+ SDValue Trunc = truncateVectorWithNARROW(NarrowedVT, WideVector, DL, DAG);
+ return DAG.getBitcast(
+ OutVT, extractSubVector(Trunc, 0, DAG, DL, OutVT.getSizeInBits()));
+ } else {
+ return truncateVectorWithNARROW(OutVT, Masked, DL, DAG);
+ }
+ return SDValue();
+}
+
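A scalar sketch of what performConvertFPCombine computes per lane (illustrative only; assumes in-range inputs, since out-of-range fp-to-int is poison in IR anyway): convert the f32 value at i32 width, mask down to the narrow element width, then pack the lanes with a narrowing step.

  #include <cstddef>
  #include <cstdint>
  // Model of fp_to_uint <N x f32> -> <N x i8>: i32 conversion, mask, narrow.
  static void fpToU8(const float *In, uint8_t *Out, std::size_t N) {
    for (std::size_t I = 0; I != N; ++I) {
      uint32_t Wide = static_cast<uint32_t>(In[I]); // fp_to_uint at i32 width
      Out[I] = static_cast<uint8_t>(Wide & 0xffu);  // mask MSBs, then narrow
    }
  }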
SDValue
WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -3623,6 +3665,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FP_ROUND:
case ISD::CONCAT_VECTORS:
return performVectorTruncZeroCombine(N, DCI);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ return performConvertFPCombine(N, DCI.DAG);
case ISD::TRUNCATE:
return performTruncateCombine(N, DCI);
case ISD::INTRINSIC_WO_CHAIN:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 472ec67..f705298 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -19,17 +19,6 @@
namespace llvm {
-namespace WebAssemblyISD {
-
-enum NodeType : unsigned {
- FIRST_NUMBER = ISD::BUILTIN_OP_END,
-#define HANDLE_NODETYPE(NODE) NODE,
-#include "WebAssemblyISD.def"
-#undef HANDLE_NODETYPE
-};
-
-} // end namespace WebAssemblyISD
-
class WebAssemblySubtarget;
class WebAssemblyTargetLowering final : public TargetLowering {
@@ -53,7 +42,6 @@ private:
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
- const char *getTargetNodeName(unsigned Opcode) const override;
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
index fc82e5b..304c4f3 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
@@ -41,6 +41,11 @@ defm REF_TEST_FUNCREF : I<(outs I32:$res), (ins TypeIndex:$type, FUNCREF:$ref),
"ref.test\t$type, $ref", "ref.test $type", 0xfb14>,
Requires<[HasGC]>;
+defm REF_FUNC : I<(outs FUNCREF:$res), (ins function32_op:$func),
+ (outs), (ins function32_op:$func), [],
+ "ref.func\t$func", "ref.func $func", 0xd2>,
+ Requires<[HasReferenceTypes]>;
+
defm "" : REF_I<FUNCREF, funcref, "func">;
defm "" : REF_I<EXTERNREF, externref, "extern">;
defm "" : REF_I<EXNREF, exnref, "exn">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 45b0e7d..f3c236c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -532,13 +532,19 @@ struct StaticLibcallNameMap {
// FIXME: This is broken if there are ever different triples compiled with
// different libcalls.
RTLIB::RuntimeLibcallsInfo RTCI(TT);
- for (RTLIB::Libcall LC : RTLIB::libcalls()) {
- StringRef NameLibcall = RTCI.getLibcallName(LC);
- if (!NameLibcall.empty() &&
- getRuntimeLibcallSignatures().Table[LC] != unsupported) {
- assert(!Map.contains(NameLibcall) &&
- "duplicate libcall names in name map");
- Map[NameLibcall] = LC;
+
+ ArrayRef<RuntimeLibcallSignature> Table =
+ getRuntimeLibcallSignatures().Table;
+ for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) {
+ if (!RTCI.isAvailable(Impl))
+ continue;
+ RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl);
+ if (Table[LC] != unsupported) {
+ StringRef NameLibcall =
+ RTLIB::RuntimeLibcallsInfo::getLibcallImplName(Impl);
+ // FIXME: Map should be to LibcallImpl
+ if (!Map.insert({NameLibcall, LC}).second)
+ llvm_unreachable("duplicate libcall names in name map");
}
}
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
index 2673c81..cf5cc41 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
@@ -11,23 +11,31 @@
///
//===----------------------------------------------------------------------===//
+#include "WebAssemblySelectionDAGInfo.h"
#include "WebAssemblyTargetMachine.h"
+
+#define GET_SDNODE_DESC
+#include "WebAssemblyGenSDNodeInfo.inc"
+
using namespace llvm;
#define DEBUG_TYPE "wasm-selectiondag-info"
+WebAssemblySelectionDAGInfo::WebAssemblySelectionDAGInfo()
+ : SelectionDAGGenTargetInfo(WebAssemblyGenSDNodeInfo) {}
+
WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() = default; // anchor
-bool WebAssemblySelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const {
+const char *
+WebAssemblySelectionDAGInfo::getTargetNodeName(unsigned Opcode) const {
switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) {
- default:
- return false;
- case WebAssemblyISD::GLOBAL_GET:
- case WebAssemblyISD::GLOBAL_SET:
- case WebAssemblyISD::TABLE_GET:
- case WebAssemblyISD::TABLE_SET:
- return true;
+ case WebAssemblyISD::CALL:
+ return "WebAssemblyISD::CALL";
+ case WebAssemblyISD::RET_CALL:
+ return "WebAssemblyISD::RET_CALL";
}
+
+ return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode);
}
SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemcpy(
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
index 69c9af0..8775f49 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
@@ -17,13 +17,26 @@
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#define GET_SDNODE_ENUM
+#include "WebAssemblyGenSDNodeInfo.inc"
+
namespace llvm {
+namespace WebAssemblyISD {
+
+enum NodeType : unsigned {
+ CALL = GENERATED_OPCODE_END,
+ RET_CALL,
+};
-class WebAssemblySelectionDAGInfo final : public SelectionDAGTargetInfo {
+} // namespace WebAssemblyISD
+
+class WebAssemblySelectionDAGInfo final : public SelectionDAGGenTargetInfo {
public:
+ WebAssemblySelectionDAGInfo();
+
~WebAssemblySelectionDAGInfo() override;
- bool isTargetMemoryOpcode(unsigned Opcode) const override;
+ const char *getTargetNodeName(unsigned Opcode) const override;
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Op1, SDValue Op2,
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 51b540a..fa23656 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -158,7 +158,16 @@ FunctionPass *createX86InsertX87waitPass();
/// This pass optimizes arithmetic based on knowledge that is only used by
/// a reduction sequence and is therefore safe to reassociate in interesting
/// ways.
-FunctionPass *createX86PartialReductionPass();
+class X86PartialReductionPass : public PassInfoMixin<X86PartialReductionPass> {
+private:
+ const X86TargetMachine *TM;
+
+public:
+ X86PartialReductionPass(const X86TargetMachine *TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+FunctionPass *createX86PartialReductionLegacyPass();
/// Analyzes and emits pseudos to support Win x64 Unwind V2.
FunctionPass *createX86WinEHUnwindV2Pass();
@@ -179,7 +188,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass();
/// The pass transforms AMX intrinsics to scalar operations if the function has
/// the optnone attribute or the optimization level is O0.
-FunctionPass *createX86LowerAMXIntrinsicsPass();
+class X86LowerAMXIntrinsicsPass
+ : public PassInfoMixin<X86LowerAMXIntrinsicsPass> {
+private:
+ const TargetMachine *TM;
+
+public:
+ X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+ static bool isRequired() { return true; }
+};
+
+FunctionPass *createX86LowerAMXIntrinsicsLegacyPass();
InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
const X86Subtarget &,
@@ -220,7 +240,7 @@ void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &);
void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
void initializeX86LowerTileCopyPass(PassRegistry &);
void initializeX86OptimizeLEAPassPass(PassRegistry &);
-void initializeX86PartialReductionPass(PassRegistry &);
+void initializeX86PartialReductionLegacyPass(PassRegistry &);
void initializeX86PreTileConfigPass(PassRegistry &);
void initializeX86ReturnThunksPass(PassRegistry &);
void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index c0c7f5a..ddbd10d 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -272,7 +272,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
const MachineOperand &Src2 = MI.getOperand(2);
bool Is32BitReg = Opc == X86::ADD32ri_ND || Opc == X86::ADD32rr_ND;
const MCInstrDesc &NewDesc =
- ST.getInstrInfo()->get(Is32BitReg ? X86::LEA32r : X86::LEA64r);
+ ST.getInstrInfo()->get(Is32BitReg ? X86::LEA64_32r : X86::LEA64r);
if (Is32BitReg)
Src1 = getX86SubSuperRegister(Src1, 64);
MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), NewDesc, Dst)
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index d4418c8..6c16fcfb 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4728,9 +4728,9 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) {
if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() &&
ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) {
- SDValue InnerOp = Op->getOperand(0);
+ SDValue InnerOp = getFoldableLogicOp(Op->getOperand(0));
- if (!getFoldableLogicOp(InnerOp))
+ if (!InnerOp)
return SDValue();
N0 = InnerOp.getOperand(0);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b97b508..168e041 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2572,8 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// Combine sin / cos into _sincos_stret if it is available.
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
- setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
if (Subtarget.isTargetWin64()) {
setOperationAction(ISD::SDIV, MVT::i128, Custom);
@@ -30908,6 +30908,63 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
}
+ if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) {
+ // On AVX512BW, we can use variable 16-bit shifts to implement variable
+ // 8-bit shifts. For this, we split the input into two vectors, RLo and RHi.
+ // The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane
+ // of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors
+ // can efficiently be merged together using a masked move.
+ MVT ExtVT = MVT::v32i16;
+
+ SDValue RLo, RHi;
+ // Isolate lower and upper lanes of Amt by masking odd lanes in AmtLo and
+ // right shifting AmtHi.
+ SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt),
+ DAG.getConstant(0x00ff, dl, ExtVT));
+ SDValue AmtHi = getTargetVShiftByConstNode(
+ X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG);
+ switch (Opc) {
+ case ISD::SHL:
+ // Because we shift left, no bits from the high half can influence the low
+ // half, so we don't need to mask RLo. We do however need to mask RHi, to
+ // prevent high bits of an even lane overflowing into low bits of an odd
+ // lane.
+ RLo = DAG.getBitcast(ExtVT, R);
+ RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo,
+ DAG.getConstant(0xff00, dl, ExtVT));
+ break;
+ case ISD::SRL:
+ // Same idea as above, but this time we need to make sure no low bits of
+ // an odd lane can overflow into high bits of an even lane.
+ RHi = DAG.getBitcast(ExtVT, R);
+ RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi,
+ DAG.getConstant(0x00ff, dl, ExtVT));
+ break;
+ case ISD::SRA:
+ // For arithmetic right shifts, we want to sign extend each even lane of R
+ // such that the upper half of the corresponding lane of RLo is 0 or -1
+ // depending on the sign bit of the original lane. We do this using 2
+ // immediate shifts.
+ RHi = DAG.getBitcast(ExtVT, R);
+ RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG);
+ RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG);
+ break;
+ default:
+ llvm_unreachable("Unexpected Shift Op");
+ }
+
+ SDValue ShiftedLo =
+ DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo));
+ SDValue ShiftedHi =
+ DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi));
+
+ // To merge the shifted vectors back together, we select even lanes
+ // from ShiftedLo and odd lanes from ShiftedHi.
+ SDValue SelectMask = DAG.getBitcast(
+ MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64));
+ return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi);
+ }
+
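A scalar model of the even/odd lane trick for the ISD::SHL case (illustrative only; SRL and SRA follow the masking described in the comments above): each v32i16 lane carries an even v64i8 lane in its low byte and an odd lane in its high byte, the two halves are shifted with independent 16-bit amounts, and the 0x5555... select mask keeps even bytes from the low result and odd bytes from the high result.

  #include <cstdint>
  // One 16-bit lane: even 8-bit lane in bits [7:0], odd 8-bit lane in [15:8].
  static uint16_t shlPair(uint16_t R, unsigned AmtEven, unsigned AmtOdd) {
    // Even lane: no masking needed, spill into the high byte is discarded.
    uint16_t Lo = static_cast<uint16_t>(R << AmtEven);
    // Odd lane: clear the low byte so it cannot shift into the high byte.
    uint16_t Hi = static_cast<uint16_t>((R & 0xff00u) << AmtOdd);
    // Merge: even byte from Lo, odd byte from Hi (the v64i1 select).
    return static_cast<uint16_t>((Hi & 0xff00u) | (Lo & 0x00ffu));
  }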
if (VT == MVT::v16i8 ||
(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) {
@@ -33004,61 +33061,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
-static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue Arg = Op.getOperand(0);
- EVT ArgVT = Arg.getValueType();
- bool isF64 = ArgVT == MVT::f64;
-
- RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
- const char *LibcallName = TLI.getLibcallName(LC);
- if (!LibcallName)
- return SDValue();
-
- assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
-
- // For MacOSX, we want to call an alternative entry point: __sincos_stret,
- // which returns the values as { float, float } (in XMM0) or
- // { double, double } (which is returned in XMM0, XMM1).
- SDLoc dl(Op);
- Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
-
- TargetLowering::ArgListTy Args;
- Args.emplace_back(Arg, ArgTy);
-
- // Only optimize x86_64 for now. i386 is a bit messy. For f32,
- // the small struct {f32, f32} is returned in (eax, edx). For f64,
- // the results are returned via SRet in memory.
- SDValue Callee =
- DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
-
- Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
- : (Type *)FixedVectorType::get(ArgTy, 2);
-
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl)
- .setChain(DAG.getEntryNode())
- .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args))
- .setIsPostTypeLegalization();
-
- std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
-
- if (isF64)
- // Returned in xmm0 and xmm1.
- return CallResult.first;
-
- // Returned in bits 0:31 and 32:64 xmm0.
- SDValue SinVal =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
- DAG.getVectorIdxConstant(0, dl));
- SDValue CosVal =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
- DAG.getVectorIdxConstant(1, dl));
- SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
- return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
-}
-
/// Widen a vector input to a vector of NVT. The
/// input vector must have the same element type as NVT.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
@@ -33663,7 +33665,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ABDS:
case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
- case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
@@ -53349,40 +53350,44 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
}
// Look for a RMW operation that only touches one bit of a larger than legal
-// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value.
+// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single
+// i32 sub value.
static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
using namespace SDPatternMatch;
-
- // Only handle normal stores and its chain was a matching normal load.
- auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
- if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld ||
- !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
- Ld->getBasePtr() != St->getBasePtr() ||
- Ld->getOffset() != St->getOffset())
- return SDValue();
-
- SDValue LoadVal(Ld, 0);
SDValue StoredVal = St->getValue();
EVT VT = StoredVal.getValueType();
- // Only narrow larger than legal scalar integers.
- if (!VT.isScalarInteger() ||
+ // Only narrow normal stores of larger than legal scalar integers.
+ if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() ||
VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32))
return SDValue();
// BTR: X & ~(1 << ShAmt)
// BTS: X | (1 << ShAmt)
// BTC: X ^ (1 << ShAmt)
- SDValue ShAmt;
- if (!StoredVal.hasOneUse() ||
- !(sd_match(StoredVal, m_And(m_Specific(LoadVal),
+ //
+ // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
+ SDValue SrcVal, InsertBit, ShAmt;
+ if (!(sd_match(StoredVal, m_And(m_Value(SrcVal),
m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
sd_match(StoredVal,
- m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+ m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
sd_match(StoredVal,
- m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt))))))
+ m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+ sd_match(
+ StoredVal,
+ m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
+ m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
+ return SDValue();
+
+ // SrcVal must be a matching normal load further up the chain.
+ auto *Ld = dyn_cast<LoadSDNode>(SrcVal);
+ if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
+ Ld->getBasePtr() != St->getBasePtr() ||
+ Ld->getOffset() != St->getOffset() ||
+ !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1)))
return SDValue();
// Ensure the shift amount is in bounds.
@@ -53390,6 +53395,13 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
if (KnownAmt.getMaxValue().uge(VT.getSizeInBits()))
return SDValue();
+ // If we're inserting a bit then it must be the LSB.
+ if (InsertBit) {
+ KnownBits KnownInsert = DAG.computeKnownBits(InsertBit);
+ if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1))
+ return SDValue();
+ }
+
// Split the shift into an alignment shift that moves the active i32 block to
// the bottom bits for truncation and a modulo shift that can act on the i32.
EVT AmtVT = ShAmt.getValueType();
@@ -53397,6 +53409,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
DAG.getSignedConstant(-32LL, DL, AmtVT));
SDValue ModuloAmt =
DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT));
+ ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8);
// Compute the byte offset for the i32 block that is changed by the RMW.
// combineTruncate will adjust the load for us in a similar way.
@@ -53408,18 +53421,39 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SDNodeFlags::NoUnsignedWrap);
// Reconstruct the BTC/BTR/BTS pattern for the i32 block and store.
- SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
+ SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt);
X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
- SDValue Mask =
- DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32),
- DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8));
- if (StoredVal.getOpcode() == ISD::AND)
- Mask = DAG.getNOT(DL, Mask, MVT::i32);
+ SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,
+ DAG.getConstant(1, DL, MVT::i32), ModuloAmt);
- SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
- return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
- Align(), St->getMemOperand()->getFlags());
+ SDValue Res;
+ if (InsertBit) {
+ SDValue BitMask =
+ DAG.getNode(ISD::SHL, DL, MVT::i32,
+ DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt);
+ Res =
+ DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32));
+ Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask);
+ } else {
+ if (StoredVal.getOpcode() == ISD::AND)
+ Mask = DAG.getNOT(DL, Mask, MVT::i32);
+ Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
+ }
+
+ SDValue NewStore =
+ DAG.getStore(St->getChain(), DL, Res, NewPtr,
+ MachinePointerInfo(St->getPointerInfo().getAddrSpace()),
+ Align(), St->getMemOperand()->getFlags());
+
+ // If there are other uses of StoredVal, replace with a new load of the
+ // whole (updated) value.
+ if (!StoredVal.hasOneUse()) {
+ SDValue NewLoad =
+ DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand());
+ DAG.ReplaceAllUsesWith(StoredVal, NewLoad);
+ }
+ return NewStore;
}
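For reference, an illustrative C++ shape of the patterns narrowBitOpRMW now handles (helper names hypothetical): a single-bit update of an integer wider than the legal scalar width only changes one aligned 32-bit word, so the wide RMW can act on that word alone.

  #include <cstdint>
  // Wide value modelled as little-endian i32 words, matching the byte-offset
  // computation above (AlignAmt / 8).
  static void setBit(uint32_t *Words, unsigned Idx) { // BTS: X | (1 << Idx)
    Words[Idx / 32] |= 1u << (Idx % 32);
  }
  static void insertBit(uint32_t *Words, unsigned Idx, uint32_t Bit /*0 or 1*/) {
    // Bit insertion: (X & ~(1 << Idx)) | (Bit << Idx), Bit known LSB-only.
    uint32_t &W = Words[Idx / 32];
    W = (W & ~(1u << (Idx % 32))) | (Bit << (Idx % 32));
  }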
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
@@ -54606,7 +54640,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
SDValue NewPtr = DAG.getMemBasePlusOffset(
Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap);
SDValue NewLoad =
- DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(),
+ DAG.getLoad(VT, DL, Ld->getChain(), NewPtr,
+ MachinePointerInfo(Ld->getPointerInfo().getAddrSpace()),
Align(), Ld->getMemOperand()->getFlags());
DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
return NewLoad;
diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
index 7f33939..662aec2 100644
--- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
@@ -23,12 +23,15 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Analysis.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
@@ -40,7 +43,7 @@
using namespace llvm;
using namespace PatternMatch;
-#define DEBUG_TYPE "lower-amx-intrinsics"
+#define DEBUG_TYPE "x86-lower-amx-intrinsics"
#ifndef NDEBUG
static bool isV256I32Ty(Type *Ty) {
@@ -627,6 +630,37 @@ bool X86LowerAMXIntrinsics::visit() {
}
namespace {
+bool shouldRunLowerAMXIntrinsics(const Function &F, const TargetMachine *TM) {
+ return X86ScalarizeAMX && (F.hasFnAttribute(Attribute::OptimizeNone) ||
+ TM->getOptLevel() == CodeGenOptLevel::None);
+}
+
+bool runLowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) {
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
+ X86LowerAMXIntrinsics LAT(F, DTU, LI);
+ return LAT.visit();
+}
+} // namespace
+
+PreservedAnalyses X86LowerAMXIntrinsicsPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ if (!shouldRunLowerAMXIntrinsics(F, TM))
+ return PreservedAnalyses::all();
+
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+ bool Changed = runLowerAMXIntrinsics(F, &DT, &LI);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA = PreservedAnalyses::none();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+}
+
+namespace {
class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass {
public:
static char ID;
@@ -634,21 +668,15 @@ public:
X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {}
bool runOnFunction(Function &F) override {
- if (!X86ScalarizeAMX)
- return false;
TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
- if (!F.hasFnAttribute(Attribute::OptimizeNone) &&
- TM->getOptLevel() != CodeGenOptLevel::None)
+ if (!shouldRunLowerAMXIntrinsics(F, TM))
return false;
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
-
- X86LowerAMXIntrinsics LAT(F, DTU, LI);
- return LAT.visit();
+ return runLowerAMXIntrinsics(F, DT, LI);
}
StringRef getPassName() const override { return "Lower AMX intrinsics"; }
@@ -668,6 +696,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName,
false, false)
-FunctionPass *llvm::createX86LowerAMXIntrinsicsPass() {
+FunctionPass *llvm::createX86LowerAMXIntrinsicsLegacyPass() {
return new X86LowerAMXIntrinsicsLegacyPass();
}
diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp
index a25e4e0..898c83c 100644
--- a/llvm/lib/Target/X86/X86PartialReduction.cpp
+++ b/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -16,10 +16,12 @@
#include "X86TargetMachine.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Analysis.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
@@ -30,39 +32,44 @@ using namespace llvm;
namespace {
-class X86PartialReduction : public FunctionPass {
+class X86PartialReduction {
+ const X86TargetMachine *TM;
const DataLayout *DL = nullptr;
const X86Subtarget *ST = nullptr;
public:
+ X86PartialReduction(const X86TargetMachine *TM) : TM(TM) {}
+ bool run(Function &F);
+
+private:
+ bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB);
+ bool trySADReplacement(Instruction *Op);
+};
+
+class X86PartialReductionLegacy : public FunctionPass {
+public:
static char ID; // Pass identification, replacement for typeid.
- X86PartialReduction() : FunctionPass(ID) { }
+ X86PartialReductionLegacy() : FunctionPass(ID) {}
- bool runOnFunction(Function &Fn) override;
+ bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
}
- StringRef getPassName() const override {
- return "X86 Partial Reduction";
- }
-
-private:
- bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB);
- bool trySADReplacement(Instruction *Op);
+ StringRef getPassName() const override { return "X86 Partial Reduction"; }
};
}
-FunctionPass *llvm::createX86PartialReductionPass() {
- return new X86PartialReduction();
+FunctionPass *llvm::createX86PartialReductionLegacyPass() {
+ return new X86PartialReductionLegacy();
}
-char X86PartialReduction::ID = 0;
+char X86PartialReductionLegacy::ID = 0;
-INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE,
- "X86 Partial Reduction", false, false)
+INITIALIZE_PASS(X86PartialReductionLegacy, DEBUG_TYPE, "X86 Partial Reduction",
+ false, false)
// This function should be aligned with detectExtMul() in X86ISelLowering.cpp.
static bool matchVPDPBUSDPattern(const X86Subtarget *ST, BinaryOperator *Mul,
@@ -494,17 +501,8 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
}
}
-bool X86PartialReduction::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
- if (!TPC)
- return false;
-
- auto &TM = TPC->getTM<X86TargetMachine>();
- ST = TM.getSubtargetImpl(F);
-
+bool X86PartialReduction::run(Function &F) {
+ ST = TM->getSubtargetImpl(F);
DL = &F.getDataLayout();
bool MadeChange = false;
@@ -540,3 +538,25 @@ bool X86PartialReduction::runOnFunction(Function &F) {
return MadeChange;
}
+
+bool X86PartialReductionLegacy::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ return X86PartialReduction(&TPC->getTM<X86TargetMachine>()).run(F);
+}
+
+PreservedAnalyses X86PartialReductionPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ bool Changed = X86PartialReduction(TM).run(F);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA = PreservedAnalyses::none();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def
index fc25d55..db25594 100644
--- a/llvm/lib/Target/X86/X86PassRegistry.def
+++ b/llvm/lib/Target/X86/X86PassRegistry.def
@@ -15,14 +15,14 @@
#ifndef FUNCTION_PASS
#define FUNCTION_PASS(NAME, CREATE_PASS)
#endif
+FUNCTION_PASS("x86-lower-amx-intrinsics", X86LowerAMXIntrinsicsPass(this))
FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this))
+FUNCTION_PASS("x86-partial-reduction", X86PartialReductionPass(this))
#undef FUNCTION_PASS
#ifndef DUMMY_FUNCTION_PASS
#define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS)
#endif
-DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this))
-DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction())
DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass())
#undef DUMMY_FUNCTION_PASS
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 9a76abc..5f0bcab 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -97,7 +97,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() {
initializeX86LoadValueInjectionLoadHardeningPassPass(PR);
initializeX86LoadValueInjectionRetHardeningPassPass(PR);
initializeX86OptimizeLEAPassPass(PR);
- initializeX86PartialReductionPass(PR);
+ initializeX86PartialReductionLegacyPass(PR);
initializePseudoProbeInserterPass(PR);
initializeX86ReturnThunksPass(PR);
initializeX86DAGToDAGISelLegacyPass(PR);
@@ -422,14 +422,14 @@ void X86PassConfig::addIRPasses() {
// We add both passes anyway; when they run, each pass skips itself based on
// the opt level and the optnone attribute.
- addPass(createX86LowerAMXIntrinsicsPass());
+ addPass(createX86LowerAMXIntrinsicsLegacyPass());
addPass(createX86LowerAMXTypeLegacyPass());
TargetPassConfig::addIRPasses();
if (TM->getOptLevel() != CodeGenOptLevel::None) {
addPass(createInterleavedAccessPass());
- addPass(createX86PartialReductionPass());
+ addPass(createX86PartialReductionLegacyPass());
}
// Add passes that handle indirect branch removal and insertion of a retpoline
diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp
index f6f7e92..2f28ab3 100644
--- a/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -66,7 +66,7 @@ namespace {
MachineBasicBlock &MBB);
void addDirtySuccessor(MachineBasicBlock &MBB);
- using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };
+ enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };
static const char* getBlockExitStateName(BlockExitState ST);
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
index bd4d4eb..5977a27 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
@@ -320,7 +320,7 @@ XtensaMCCodeEmitter::getMemRegEncoding(const MCInst &MI, unsigned OpNo,
case Xtensa::SSIP:
case Xtensa::LSI:
case Xtensa::LSIP:
-
+ case Xtensa::S32C1I:
if (Res & 0x3) {
report_fatal_error("Unexpected operand value!");
}
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
index 4e73070..8d0fd07 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
@@ -202,7 +202,7 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits,
return FeatureBits[Xtensa::FeatureWindowed];
case Xtensa::ATOMCTL:
case Xtensa::SCOMPARE1:
- return FeatureBits[Xtensa::FeatureWindowed];
+ return FeatureBits[Xtensa::FeatureS32C1I];
case Xtensa::NoRegister:
return false;
}
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
index b0f924f..be69cef 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
@@ -114,14 +114,31 @@ void XtensaInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
const DebugLoc &DL, Register DestReg,
Register SrcReg, bool KillSrc,
bool RenamableDest, bool RenamableSrc) const {
- // The MOV instruction is not present in core ISA,
+ unsigned Opcode;
+
+ // The MOV instruction is not present in core ISA for AR registers,
// so use OR instruction.
- if (Xtensa::ARRegClass.contains(DestReg, SrcReg))
+ if (Xtensa::ARRegClass.contains(DestReg, SrcReg)) {
BuildMI(MBB, MBBI, DL, get(Xtensa::OR), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc))
.addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ if (STI.hasSingleFloat() && Xtensa::FPRRegClass.contains(SrcReg) &&
+ Xtensa::FPRRegClass.contains(DestReg))
+ Opcode = Xtensa::MOV_S;
+ else if (STI.hasSingleFloat() && Xtensa::FPRRegClass.contains(SrcReg) &&
+ Xtensa::ARRegClass.contains(DestReg))
+ Opcode = Xtensa::RFR;
+ else if (STI.hasSingleFloat() && Xtensa::ARRegClass.contains(SrcReg) &&
+ Xtensa::FPRRegClass.contains(DestReg))
+ Opcode = Xtensa::WFR;
else
report_fatal_error("Impossible reg-to-reg copy");
+
+ BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
}
void XtensaInstrInfo::storeRegToStackSlot(
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 7a95df4..b575d76 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1378,8 +1378,7 @@ static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU,
IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N);
// We can't know the precise weights here, as they would depend on the value
// distribution of Call->getArgOperand(1). So we just mark it as "unknown".
- setExplicitlyUnknownBranchWeightsIfProfiled(*SI, *Call->getFunction(),
- DEBUG_TYPE);
+ setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE);
Type *IndexTy = DL.getIndexType(Call->getType());
SmallVector<DominatorTree::UpdateType, 8> Updates;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index d85e4f7..9bdd8cb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -479,7 +479,7 @@ private:
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
auto *Sel = SelectInst::Create(C, S1, S2, NameStr, InsertBefore, nullptr);
- setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, F, DEBUG_TYPE);
+ setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, DEBUG_TYPE, &F);
return Sel;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 67f837c..b158e0f 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2261,11 +2261,11 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) {
}
Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) {
- if (!isa<Constant>(I.getOperand(1)))
- return nullptr;
+ bool IsOtherParamConst = isa<Constant>(I.getOperand(1));
if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) {
- if (Instruction *NewSel = FoldOpIntoSelect(I, Sel))
+ if (Instruction *NewSel =
+ FoldOpIntoSelect(I, Sel, false, !IsOtherParamConst))
return NewSel;
} else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) {
if (Instruction *NewPhi = foldOpIntoPhi(I, PN))
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 471c6ec..ceeece4 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3903,7 +3903,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// adding/"accumulating" %s. "Accumulation" stores the result in one
// of the source registers, but this accumulate vs. add distinction
// is lost when dealing with LLVM intrinsics.)
+ //
+ // ZeroPurifies means that multiplying a known-zero with an uninitialized
+ // value results in an initialized value. This is applicable for integer
+ // multiplication, but not floating-point (counter-example: NaN).
void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor,
+ bool ZeroPurifies,
unsigned EltSizeInBits = 0) {
IRBuilder<> IRB(&I);
@@ -3945,7 +3950,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
assert(AccumulatorType == ReturnType);
}
- FixedVectorType *ImplicitReturnType = ReturnType;
+ FixedVectorType *ImplicitReturnType =
+ cast<FixedVectorType>(getShadowTy(ReturnType));
// Step 1: instrument multiplication of corresponding vector elements
if (EltSizeInBits) {
ImplicitReturnType = cast<FixedVectorType>(
@@ -3964,30 +3970,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
ReturnType->getNumElements() * ReductionFactor);
}
- // Multiplying an *initialized* zero by an uninitialized element results in
- // an initialized zero element.
- //
- // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value
- // results in an unpoisoned value. We can therefore adapt the visitAnd()
- // instrumentation:
- // OutShadow = (SaNonZero & SbNonZero)
- // | (VaNonZero & SbNonZero)
- // | (SaNonZero & VbNonZero)
- // where non-zero is checked on a per-element basis (not per bit).
- Value *SZero = Constant::getNullValue(Va->getType());
- Value *VZero = Constant::getNullValue(Sa->getType());
- Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero);
- Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero);
- Value *VaNonZero = IRB.CreateICmpNE(Va, VZero);
- Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero);
-
- Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero);
- Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero);
- Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero);
-
// Each element of the vector is represented by a single bit (poisoned or
// not) e.g., <8 x i1>.
- Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero});
+ Value *SaNonZero = IRB.CreateIsNotNull(Sa);
+ Value *SbNonZero = IRB.CreateIsNotNull(Sb);
+ Value *And;
+ if (ZeroPurifies) {
+ // Multiplying an *initialized* zero by an uninitialized element results
+ // in an initialized zero element.
+ //
+ // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value
+ // results in an unpoisoned value. We can therefore adapt the visitAnd()
+ // instrumentation:
+ // OutShadow = (SaNonZero & SbNonZero)
+ // | (VaNonZero & SbNonZero)
+ // | (SaNonZero & VbNonZero)
+ // where non-zero is checked on a per-element basis (not per bit).
+ Value *VaInt = Va;
+ Value *VbInt = Vb;
+ if (!Va->getType()->isIntegerTy()) {
+ VaInt = CreateAppToShadowCast(IRB, Va);
+ VbInt = CreateAppToShadowCast(IRB, Vb);
+ }
+
+ Value *VaNonZero = IRB.CreateIsNotNull(VaInt);
+ Value *VbNonZero = IRB.CreateIsNotNull(VbInt);
+
+ Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero);
+ Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero);
+ Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero);
+
+ And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero});
+ } else {
+ And = IRB.CreateOr({SaNonZero, SbNonZero});
+ }
// Extend <8 x i1> to <8 x i16>.
// (The real pmadd intrinsic would have computed intermediate values of
@@ -5752,17 +5768,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
case Intrinsic::x86_avx2_pmadd_ub_sw:
case Intrinsic::x86_avx512_pmaddubs_w_512:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+ /*ZeroPurifies=*/true);
break;
// <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
case Intrinsic::x86_ssse3_pmadd_ub_sw:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+ /*ZeroPurifies=*/true, /*EltSizeInBits=*/8);
break;
// <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
case Intrinsic::x86_mmx_pmadd_wd:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+ /*ZeroPurifies=*/true, /*EltSizeInBits=*/16);
break;
// AVX Vector Neural Network Instructions: bytes
@@ -5848,7 +5867,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_avx2_vpdpbuuds_128:
case Intrinsic::x86_avx2_vpdpbuuds_256:
case Intrinsic::x86_avx10_vpdpbuuds_512:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4,
+ /*ZeroPurifies=*/true, /*EltSizeInBits=*/8);
break;
// AVX Vector Neural Network Instructions: words
@@ -5901,7 +5921,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_avx512_vpdpwssds_128:
case Intrinsic::x86_avx512_vpdpwssds_256:
case Intrinsic::x86_avx512_vpdpwssds_512:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+ /*ZeroPurifies=*/true, /*EltSizeInBits=*/16);
break;
// TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single
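
A minimal standalone sketch (plain C++, not part of the patch) of the per-lane rule the handleVectorPmaddIntrinsic hunks above encode; Sa/Sb stand for the operand shadows, Va/Vb for the operand values, and the helper name is illustrative:

#include <cstdint>
#include <cstdio>

// Returns true if the product lane must be treated as uninitialized.
// Shadow lanes (Sa/Sb) are nonzero when the corresponding value lane has
// any uninitialized bits.
bool productLaneIsPoisoned(uint64_t Va, uint64_t Sa, uint64_t Vb, uint64_t Sb,
                           bool ZeroPurifies) {
  bool SaNonZero = Sa != 0, SbNonZero = Sb != 0;
  if (!ZeroPurifies) // e.g. floating point: 0 * NaN is still NaN
    return SaNonZero || SbNonZero;
  bool VaNonZero = Va != 0, VbNonZero = Vb != 0;
  // Mirrors the AND-like rule: a fully initialized zero operand purifies.
  return (SaNonZero && SbNonZero) || (VaNonZero && SbNonZero) ||
         (SaNonZero && VbNonZero);
}

int main() {
  // Initialized zero times an uninitialized lane: clean for integers...
  std::printf("%d\n", productLaneIsPoisoned(0, 0, 123, ~0ull, true));  // 0
  // ...but not under floating-point semantics (ZeroPurifies == false).
  std::printf("%d\n", productLaneIsPoisoned(0, 0, 123, ~0ull, false)); // 1
}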
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 19eccb9..9ffa602 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -1796,14 +1796,16 @@ private:
// mergeLatch may remove the only block in FC1.
SE.forgetLoop(FC1.L);
SE.forgetLoop(FC0.L);
- // Forget block dispositions as well, so that there are no dangling
- // pointers to erased/free'ed blocks.
- SE.forgetBlockAndLoopDispositions();
// Move instructions from FC0.Latch to FC1.Latch.
// Note: mergeLatch requires an updated DT.
mergeLatch(FC0, FC1);
+ // Forget block dispositions as well, so that there are no dangling
+ // pointers to erased/free'ed blocks. It should be done after mergeLatch()
+ // since merging the latches may affect the dispositions.
+ SE.forgetBlockAndLoopDispositions();
+
// Merge the loops.
SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
for (BasicBlock *BB : Blocks) {
@@ -2092,14 +2094,16 @@ private:
// mergeLatch may remove the only block in FC1.
SE.forgetLoop(FC1.L);
SE.forgetLoop(FC0.L);
- // Forget block dispositions as well, so that there are no dangling
- // pointers to erased/free'ed blocks.
- SE.forgetBlockAndLoopDispositions();
// Move instructions from FC0.Latch to FC1.Latch.
// Note: mergeLatch requires an updated DT.
mergeLatch(FC0, FC1);
+ // Forget block dispositions as well, so that there are no dangling
+ // pointers to erased/free'ed blocks. It should be done after mergeLatch()
+ // since merging the latches may affect the dispositions.
+ SE.forgetBlockAndLoopDispositions();
+
// Merge the loops.
SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
for (BasicBlock *BB : Blocks) {
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 019536ca..9070d25 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -72,6 +72,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -105,6 +106,7 @@ STATISTIC(
STATISTIC(NumShiftUntilZero,
"Number of uncountable loops recognized as 'shift until zero' idiom");
+namespace llvm {
bool DisableLIRP::All;
static cl::opt<bool, true>
DisableLIRPAll("disable-" DEBUG_TYPE "-all",
@@ -163,6 +165,10 @@ static cl::opt<bool> ForceMemsetPatternIntrinsic(
cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false),
cl::Hidden);
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
namespace {
class LoopIdiomRecognize {
@@ -3199,7 +3205,21 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
// The loop trip count check.
auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
CurLoop->getName() + ".ivcheck");
- Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+ if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1))
+ std::swap(BranchWeights[0], BranchWeights[1]);
+ // We're not changing the loop profile, so we can reuse the original loop's
+ // profile.
+ setBranchWeights(*BI, BranchWeights,
+ /*IsExpected=*/false);
+ }
+
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
@@ -3368,10 +3388,10 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
/// %start = <...>
/// %extraoffset = <...>
/// <...>
-/// br label %for.cond
+/// br label %loop
///
/// loop:
-/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
+/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
/// %nbits = add nsw i8 %iv, %extraoffset
/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
@@ -3533,7 +3553,19 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() {
// The loop terminator.
Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
- Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+ if (InvertedCond)
+ std::swap(BranchWeights[0], BranchWeights[1]);
+ // We're not changing the loop profile, so we can reuse the original loop's
+ // profile.
+ setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false);
+ }
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
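
Both recognizeShiftUntilBitTest and recognizeShiftUntilZero above reuse the original latch's branch weights and swap the pair when the rebuilt branch lists its successors in the opposite order. A standalone sketch of just that bookkeeping, with plain C++ arrays standing in for the profile metadata (names are illustrative):

#include <array>
#include <cstdint>
#include <cstdio>
#include <utility>

// A conditional branch carries two weights in successor order; reusing the
// original loop's profile only requires swapping them when the exit block
// ends up as the other successor of the rebuilt branch.
std::array<uint32_t, 2> reuseLoopBranchWeights(std::array<uint32_t, 2> Weights,
                                               bool ExitWasSecondSuccessor) {
  if (ExitWasSecondSuccessor)
    std::swap(Weights[0], Weights[1]);
  return Weights;
}

int main() {
  auto W = reuseLoopBranchWeights({1, 99}, /*ExitWasSecondSuccessor=*/true);
  std::printf("%u %u\n", W[0], W[1]); // prints "99 1"
}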
diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index a883998..1b770be 100644
--- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -89,8 +89,8 @@ struct StoreToLoadForwardingCandidate {
/// Return true if the dependence from the store to the load has an
/// absolute distance of one.
/// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop)
- bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
- Loop *L) const {
+ bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L,
+ const DominatorTree &DT) const {
Value *LoadPtr = Load->getPointerOperand();
Value *StorePtr = Store->getPointerOperand();
Type *LoadType = getLoadStoreType(Load);
@@ -102,8 +102,10 @@ struct StoreToLoadForwardingCandidate {
DL.getTypeSizeInBits(getLoadStoreType(Store)) &&
"Should be a known dependence");
- int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0);
- int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0);
+ int64_t StrideLoad =
+ getPtrStride(PSE, LoadType, LoadPtr, L, DT).value_or(0);
+ int64_t StrideStore =
+ getPtrStride(PSE, LoadType, StorePtr, L, DT).value_or(0);
if (!StrideLoad || !StrideStore || StrideLoad != StrideStore)
return false;
@@ -287,8 +289,8 @@ public:
// so deciding which one forwards is easy. The later one forwards as
// long as they both have a dependence distance of one to the load.
if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
- Cand.isDependenceDistanceOfOne(PSE, L) &&
- OtherCand->isDependenceDistanceOfOne(PSE, L)) {
+ Cand.isDependenceDistanceOfOne(PSE, L, *DT) &&
+ OtherCand->isDependenceDistanceOfOne(PSE, L, *DT)) {
// They are in the same block, the later one will forward to the load.
if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
OtherCand = &Cand;
@@ -538,7 +540,7 @@ public:
// Check whether the SCEV difference is the same as the induction step,
// thus we load the value in the next iteration.
- if (!Cand.isDependenceDistanceOfOne(PSE, L))
+ if (!Cand.isDependenceDistanceOfOne(PSE, L, *DT))
continue;
assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) &&
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index b9546c5..e902b71 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
@@ -393,6 +394,17 @@ private:
DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
++NumLoopExitsDeleted;
}
+  // We don't really need to add branch weights to DummySwitch, because all
+  // branches but one are just a temporary artifact - see the comment on top
+  // of this function. But it's easy to estimate the weights, and it helps
+  // maintain a property of the overall compiler: that branch weights don't
+  // "just get dropped" accidentally (i.e., profcheck).
+ if (DummySwitch->getParent()->getParent()->hasProfileData()) {
+ SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases());
+    // The default gets 100% of the probability; the remaining cases are dead.
+ DummyBranchWeights[0] = 1;
+ setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false);
+ }
assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 2bda9d8..802ae4e 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1327,7 +1327,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
}
// Do not attempt partial/runtime unrolling in FullLoopUnrolling
- if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) {
+ if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) ||
+ UP.Count < TripCount || UP.Count < MaxTripCount)) {
LLVM_DEBUG(
dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n");
return LoopUnrollResult::Unmodified;
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 239526e..0f3e664 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -40,6 +40,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
@@ -329,8 +330,7 @@ static void buildPartialUnswitchConditionalBranch(
HasBranchWeights ? ComputeProfFrom.getMetadata(LLVMContext::MD_prof)
: nullptr);
if (!HasBranchWeights)
- setExplicitlyUnknownBranchWeightsIfProfiled(
- *BR, *BR->getParent()->getParent(), DEBUG_TYPE);
+ setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE);
}
/// Copy a set of loop invariant values, and conditionally branch on them.
@@ -388,8 +388,7 @@ static void buildPartialInvariantUnswitchConditionalBranch(
IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
Direction ? &NormalSucc : &UnswitchedSucc, ProfData);
if (!ProfData)
- setExplicitlyUnknownBranchWeightsIfProfiled(*BR, *BR->getFunction(),
- DEBUG_TYPE);
+ setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE);
}
/// Rewrite the PHI nodes in an unswitched loop exit basic block.
@@ -2831,9 +2830,14 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
MSSAU->getMemorySSA()->verifyMemorySSA();
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- Instruction *DeoptBlockTerm =
- SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true,
- GI->getMetadata(LLVMContext::MD_prof), &DTU, &LI);
+ // llvm.experimental.guard doesn't have branch weights. We can assume,
+ // however, that the deopt path is unlikely.
+ Instruction *DeoptBlockTerm = SplitBlockAndInsertIfThen(
+ GI->getArgOperand(0), GI, true,
+ !ProfcheckDisableMetadataFixes && EstimateProfile
+ ? MDBuilder(GI->getContext()).createUnlikelyBranchWeights()
+ : nullptr,
+ &DTU, &LI);
BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
// SplitBlockAndInsertIfThen inserts control flow that branches to
// DeoptBlockTerm if the condition is true. We want the opposite.
@@ -3197,10 +3201,14 @@ injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L,
Builder.SetInsertPoint(TI);
auto *InvariantBr =
Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock);
+ // We don't know anything about the relation between the limits.
+ setExplicitlyUnknownBranchWeightsIfProfiled(*InvariantBr, DEBUG_TYPE);
Builder.SetInsertPoint(CheckBlock);
- Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0),
- TI->getSuccessor(1));
+ Builder.CreateCondBr(
+ TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1),
+ !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof)
+ : nullptr);
TI->eraseFromParent();
// Fixup phis.
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index 573a781..02b73e8 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -1283,6 +1283,12 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
case LibFunc_ilogbl:
case LibFunc_logf:
case LibFunc_logl:
+ case LibFunc_nextafter:
+ case LibFunc_nextafterf:
+ case LibFunc_nextafterl:
+ case LibFunc_nexttoward:
+ case LibFunc_nexttowardf:
+ case LibFunc_nexttowardl:
case LibFunc_pow:
case LibFunc_powf:
case LibFunc_powl:
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 5ba6f95f..6086615 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -933,6 +933,7 @@ Function *CodeExtractor::constructFunctionDeclaration(
case Attribute::CoroDestroyOnlyWhenComplete:
case Attribute::CoroElideSafe:
case Attribute::NoDivergenceSource:
+ case Attribute::NoCreateUndefOrPoison:
continue;
// Those attributes should be safe to propagate to the extracted function.
case Attribute::AlwaysInline:
diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
index 0642d51..dd8706c 100644
--- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
+++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
@@ -16,22 +16,62 @@
using namespace llvm;
+static void mergeAttributes(LLVMContext &Ctx, const Module &M,
+ const DataLayout &DL, const Triple &TT,
+ Function *Func, FunctionType *FuncTy,
+ AttributeList FuncAttrs) {
+ AttributeList OldAttrs = Func->getAttributes();
+ AttributeList NewAttrs = OldAttrs;
+
+ {
+ AttrBuilder OldBuilder(Ctx, OldAttrs.getFnAttrs());
+ AttrBuilder NewBuilder(Ctx, FuncAttrs.getFnAttrs());
+ OldBuilder.merge(NewBuilder);
+ NewAttrs = NewAttrs.addFnAttributes(Ctx, OldBuilder);
+ }
+
+ {
+ AttrBuilder OldBuilder(Ctx, OldAttrs.getRetAttrs());
+ AttrBuilder NewBuilder(Ctx, FuncAttrs.getRetAttrs());
+ OldBuilder.merge(NewBuilder);
+ NewAttrs = NewAttrs.addRetAttributes(Ctx, OldBuilder);
+ }
+
+ for (unsigned I = 0, E = FuncTy->getNumParams(); I != E; ++I) {
+ AttrBuilder OldBuilder(Ctx, OldAttrs.getParamAttrs(I));
+ AttrBuilder NewBuilder(Ctx, FuncAttrs.getParamAttrs(I));
+ OldBuilder.merge(NewBuilder);
+ NewAttrs = NewAttrs.addParamAttributes(Ctx, I, OldBuilder);
+ }
+
+ Func->setAttributes(NewAttrs);
+}
+
PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M,
ModuleAnalysisManager &MAM) {
RTLIB::RuntimeLibcallsInfo RTLCI(M.getTargetTriple());
LLVMContext &Ctx = M.getContext();
+ const DataLayout &DL = M.getDataLayout();
+ const Triple &TT = M.getTargetTriple();
- for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) {
- if (Impl == RTLIB::Unsupported)
+ for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) {
+ if (!RTLCI.isAvailable(Impl))
continue;
- // TODO: Declare with correct type, calling convention, and attributes.
+ auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl);
- FunctionType *FuncTy =
- FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true);
+ // TODO: Declare with correct type, calling convention, and attributes.
+ if (!FuncTy)
+ FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true);
StringRef FuncName = RTLCI.getLibcallImplName(Impl);
- M.getOrInsertFunction(FuncName, FuncTy);
+
+ Function *Func =
+ cast<Function>(M.getOrInsertFunction(FuncName, FuncTy).getCallee());
+ if (Func->getFunctionType() == FuncTy) {
+ mergeAttributes(Ctx, M, DL, TT, Func, FuncTy, FuncAttrs);
+ Func->setCallingConv(RTLCI.getLibcallImplCallingConv(Impl));
+ }
}
return PreservedAnalyses::none();
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 1e8f6cc..6c9467b 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -202,6 +202,27 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
/// probability of executing at least one more iteration?
static BranchProbability
probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) {
+  // OriginalLoopProb == 1 would produce a division by zero in the calculation
+  // below. That case indicates an always-infinite loop, and a remainder trip
+  // count cannot be computed at run time when the original loop is infinite,
+  // as infinity % UnrollCount is undefined. We therefore choose probabilities
+  // indicating that all remainder loop iterations will always
+  // execute.
+ //
+ // Currently, the remainder loop here is an epilogue, which cannot be reached
+ // if the original loop is infinite, so the aforementioned choice is
+ // arbitrary.
+ //
+ // FIXME: Branch weights still need to be fixed in the case of prologues
+ // (issue #135812). In that case, the aforementioned choice seems reasonable
+ // for the goal of maintaining the original loop's block frequencies. That
+ // is, an infinite loop's initial iterations are not skipped, and the prologue
+ // loop body might have unique blocks that execute a finite number of times
+ // if, for example, the original loop body contains conditionals like i <
+ // UnrollCount.
+ if (OriginalLoopProb == BranchProbability::getOne())
+ return BranchProbability::getOne();
+
// Each of these variables holds the original loop's probability that the
// number of iterations it will execute is some m in the specified range.
BranchProbability ProbOne = OriginalLoopProb; // 1 <= m
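
The early-out added above exists because the computation that follows divides by a quantity that vanishes when OriginalLoopProb is 1. A standalone illustration using an assumed truncated-geometric trip-count model; the formula below is only a stand-in for the pass's actual arithmetic, and only the p == 1 guard mirrors the patch:

#include <cmath>
#include <cstdio>

// Stand-in conditional probability that at least one more of the (at most
// UnrollCount - 1) remainder iterations executes after the first N, assuming
// P(m >= k) = p^(k-1) truncated at the unroll count.
double probOfNextInRemainder(double p, unsigned N, unsigned UnrollCount) {
  // p == 1 models an always-infinite original loop; a remainder trip count is
  // undefined there, so treat every remainder iteration as always executed.
  if (p == 1.0)
    return 1.0;
  double Num = std::pow(p, N + 1) - std::pow(p, UnrollCount);
  double Den = std::pow(p, N) - std::pow(p, UnrollCount);
  return Num / Den; // Den vanishes when p == 1 (division by zero without the guard)
}

int main() {
  std::printf("%f\n", probOfNextInRemainder(0.5, 1, 4)); // ~0.428571
  std::printf("%f\n", probOfNextInRemainder(1.0, 1, 4)); // 1.000000
}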
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index ec2e6c1..9c8b6ef 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -109,8 +110,12 @@ void LoopVersioning::versionLoop(
// Insert the conditional branch based on the result of the memchecks.
Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
Builder.SetInsertPoint(OrigTerm);
- Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader());
+ auto *BI =
+ Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader());
+  // We don't know the probability of executing the versioned vs. the
+  // unversioned variant.
+ setExplicitlyUnknownBranchWeightsIfProfiled(*BI, DEBUG_TYPE);
OrigTerm->eraseFromParent();
// The loops merge in the original exit block. This is now dominated by the
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index cbc604e..37c048f 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -778,8 +778,10 @@ private:
return false;
// Add all values from the range to the set
- for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
+ APInt Tmp = Span.getLower();
+ do
Vals.push_back(ConstantInt::get(I->getContext(), Tmp));
+ while (++Tmp != Span.getUpper());
UsedICmps++;
return true;
@@ -5212,8 +5214,7 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
// We don't have any info about this condition.
auto *Br = TrueWhenEqual ? Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB)
: Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
- setExplicitlyUnknownBranchWeightsIfProfiled(*Br, *NewBB->getParent(),
- DEBUG_TYPE);
+ setExplicitlyUnknownBranchWeightsIfProfiled(*Br, DEBUG_TYPE);
OldTI->eraseFromParent();
@@ -6020,6 +6021,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
const DataLayout &DL) {
Value *Cond = SI->getCondition();
KnownBits Known = computeKnownBits(Cond, DL, AC, SI);
+ SmallPtrSet<const Constant *, 4> KnownValues;
+ bool IsKnownValuesValid = collectPossibleValues(Cond, KnownValues, 4);
// We can also eliminate cases by determining that their values are outside of
// the limited range of the condition based on how many significant (non-sign)
@@ -6039,15 +6042,18 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
UniqueSuccessors.push_back(Successor);
++It->second;
}
- const APInt &CaseVal = Case.getCaseValue()->getValue();
+ ConstantInt *CaseC = Case.getCaseValue();
+ const APInt &CaseVal = CaseC->getValue();
if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) ||
- (CaseVal.getSignificantBits() > MaxSignificantBitsInCond)) {
- DeadCases.push_back(Case.getCaseValue());
+ (CaseVal.getSignificantBits() > MaxSignificantBitsInCond) ||
+ (IsKnownValuesValid && !KnownValues.contains(CaseC))) {
+ DeadCases.push_back(CaseC);
if (DTU)
--NumPerSuccessorCases[Successor];
LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal
<< " is dead.\n");
- }
+ } else if (IsKnownValuesValid)
+ KnownValues.erase(CaseC);
}
// If we can prove that the cases must cover all possible values, the
@@ -6058,33 +6064,41 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
const unsigned NumUnknownBits =
Known.getBitWidth() - (Known.Zero | Known.One).popcount();
assert(NumUnknownBits <= Known.getBitWidth());
- if (HasDefault && DeadCases.empty() &&
- NumUnknownBits < 64 /* avoid overflow */) {
- uint64_t AllNumCases = 1ULL << NumUnknownBits;
- if (SI->getNumCases() == AllNumCases) {
+ if (HasDefault && DeadCases.empty()) {
+ if (IsKnownValuesValid && all_of(KnownValues, IsaPred<UndefValue>)) {
createUnreachableSwitchDefault(SI, DTU);
return true;
}
- // When only one case value is missing, replace default with that case.
- // Eliminating the default branch will provide more opportunities for
- // optimization, such as lookup tables.
- if (SI->getNumCases() == AllNumCases - 1) {
- assert(NumUnknownBits > 1 && "Should be canonicalized to a branch");
- IntegerType *CondTy = cast<IntegerType>(Cond->getType());
- if (CondTy->getIntegerBitWidth() > 64 ||
- !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth()))
- return false;
- uint64_t MissingCaseVal = 0;
- for (const auto &Case : SI->cases())
- MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue();
- auto *MissingCase =
- cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal));
- SwitchInstProfUpdateWrapper SIW(*SI);
- SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0));
- createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false);
- SIW.setSuccessorWeight(0, 0);
- return true;
+ if (NumUnknownBits < 64 /* avoid overflow */) {
+ uint64_t AllNumCases = 1ULL << NumUnknownBits;
+ if (SI->getNumCases() == AllNumCases) {
+ createUnreachableSwitchDefault(SI, DTU);
+ return true;
+ }
+ // When only one case value is missing, replace default with that case.
+ // Eliminating the default branch will provide more opportunities for
+ // optimization, such as lookup tables.
+ if (SI->getNumCases() == AllNumCases - 1) {
+ assert(NumUnknownBits > 1 && "Should be canonicalized to a branch");
+ IntegerType *CondTy = cast<IntegerType>(Cond->getType());
+ if (CondTy->getIntegerBitWidth() > 64 ||
+ !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth()))
+ return false;
+
+ uint64_t MissingCaseVal = 0;
+ for (const auto &Case : SI->cases())
+ MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue();
+ auto *MissingCase = cast<ConstantInt>(
+ ConstantInt::get(Cond->getType(), MissingCaseVal));
+ SwitchInstProfUpdateWrapper SIW(*SI);
+ SIW.addCase(MissingCase, SI->getDefaultDest(),
+ SIW.getSuccessorWeight(0));
+ createUnreachableSwitchDefault(SI, DTU,
+ /*RemoveOrigDefaultBlock*/ false);
+ SIW.setSuccessorWeight(0, 0);
+ return true;
+ }
}
}
@@ -7570,6 +7584,81 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
return true;
}
+/// Tries to transform the switch when the condition is a umin with a constant.
+/// In that case, the default destination can be replaced by the destination of
+/// the case matching the constant. This method also removes dead cases even
+/// when the simplification cannot replace the default destination.
+///
+/// For example:
+/// switch(umin(a, 3)) {
+/// case 0:
+/// case 1:
+/// case 2:
+/// case 3:
+/// case 4:
+/// // ...
+/// default:
+/// unreachable
+/// }
+///
+/// Transforms into:
+///
+/// switch(a) {
+/// case 0:
+/// case 1:
+/// case 2:
+/// default:
+/// // This is case 3
+/// }
+static bool simplifySwitchWhenUMin(SwitchInst *SI, DomTreeUpdater *DTU) {
+ Value *A;
+ ConstantInt *Constant;
+
+ if (!match(SI->getCondition(), m_UMin(m_Value(A), m_ConstantInt(Constant))))
+ return false;
+
+ SmallVector<DominatorTree::UpdateType> Updates;
+ SwitchInstProfUpdateWrapper SIW(*SI);
+ BasicBlock *BB = SIW->getParent();
+
+ // Dead cases are removed even when the simplification fails.
+ // A case is dead when its value is higher than the Constant.
+ for (auto I = SI->case_begin(), E = SI->case_end(); I != E;) {
+ if (!I->getCaseValue()->getValue().ugt(Constant->getValue())) {
+ ++I;
+ continue;
+ }
+ BasicBlock *DeadCaseBB = I->getCaseSuccessor();
+ DeadCaseBB->removePredecessor(BB);
+ Updates.push_back({DominatorTree::Delete, BB, DeadCaseBB});
+ I = SIW->removeCase(I);
+ E = SIW->case_end();
+ }
+
+ auto Case = SI->findCaseValue(Constant);
+  // If the case value is not found, `findCaseValue` returns the default case.
+  // In that scenario there is no explicit case matching the constant (the
+  // `case 3:` in the example above), so the simplification fails. It also
+  // fails when the switch's default destination is reachable.
+ if (!SI->defaultDestUnreachable() || Case == SI->case_default()) {
+ if (DTU)
+ DTU->applyUpdates(Updates);
+ return !Updates.empty();
+ }
+
+ BasicBlock *Unreachable = SI->getDefaultDest();
+ SIW.replaceDefaultDest(Case);
+ SIW.removeCase(Case);
+ SIW->setCondition(A);
+
+ Updates.push_back({DominatorTree::Delete, BB, Unreachable});
+
+ if (DTU)
+ DTU->applyUpdates(Updates);
+
+ return true;
+}
+
/// Tries to transform switch of powers of two to reduce switch range.
/// For example, switch like:
/// switch (C) { case 1: case 2: case 64: case 128: }
@@ -7642,19 +7731,24 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
// label. The other is those powers of 2 that don't appear in the case
// statement. We don't know the distribution of the values coming in, so
// the safest is to split 50-50 the original probability to `default`.
- uint64_t OrigDenominator = sum_of(map_range(
- Weights, [](const auto &V) { return static_cast<uint64_t>(V); }));
+ uint64_t OrigDenominator =
+ sum_of(map_range(Weights, StaticCastTo<uint64_t>));
SmallVector<uint64_t> NewWeights(2);
NewWeights[1] = Weights[0] / 2;
NewWeights[0] = OrigDenominator - NewWeights[1];
setFittedBranchWeights(*BI, NewWeights, /*IsExpected=*/false);
-
- // For the original switch, we reduce the weight of the default by the
- // amount by which the previous branch contributes to getting to default,
- // and then make sure the remaining weights have the same relative ratio
- // wrt eachother.
+  // The probability of executing the default block stays constant. It was
+  //   p_d = Weights[0] / OrigDenominator,
+  // which we rewrite as W/D.
+  // We want the probability X of taking the default branch of the switch
+  // itself. We have W/D = W/(2D) + X * (1 - W/(2D)),
+  // i.e. the original probability is the probability of reaching default
+  // via the BI branch, or of taking the default branch on the SI.
+  // Solving gives X = W / (2D - W), or equivalently (W/2) / (D - W/2).
+  // This matches using W/2 as the default branch weight (numerator) and
+  // D - W/2 as the denominator.
+ Weights[0] = NewWeights[1];
uint64_t CasesDenominator = OrigDenominator - Weights[0];
- Weights[0] /= 2;
for (auto &W : drop_begin(Weights))
W = NewWeights[0] * static_cast<double>(W) / CasesDenominator;
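
A quick numeric check of the identity described in the comment above, with plain doubles standing in for the branch weights: giving W/2 to the new branch and W/2 (out of the remaining D - W/2) to the switch default keeps the overall probability of reaching the default block at W/D.

#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  double W = 10, D = 100;                        // original default weight / total
  double pBranchDefault = (W / 2) / D;           // new BI branch: W/2 out of D
  double pSwitchDefault = (W / 2) / (D - W / 2); // switch: W/2 of the remainder
  double pOverall =
      pBranchDefault + (1.0 - pBranchDefault) * pSwitchDefault;
  std::printf("%f vs %f\n", pOverall, W / D);    // both print 0.100000
  assert(std::fabs(pOverall - W / D) < 1e-12);
}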
@@ -8037,6 +8131,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
if (simplifyDuplicateSwitchArms(SI, DTU))
return requestResimplify();
+ if (simplifySwitchWhenUMin(SI, DTU))
+ return requestResimplify();
+
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index fdfff16..03112c6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -462,8 +462,9 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
bool CanAddPredicate = !llvm::shouldOptimizeForSize(
TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
- int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides,
- CanAddPredicate, false).value_or(0);
+ int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, *DT, Strides,
+ CanAddPredicate, false)
+ .value_or(0);
if (Stride == 1 || Stride == -1)
return Stride;
return 0;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e5c3f17..906fa2f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7550,13 +7550,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
- Load->getAlign(), VPIRMetadata(*Load, LVer),
- I->getDebugLoc());
+ VPIRMetadata(*Load, LVer), I->getDebugLoc());
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
- Reverse, Store->getAlign(),
- VPIRMetadata(*Store, LVer), I->getDebugLoc());
+ Reverse, VPIRMetadata(*Store, LVer),
+ I->getDebugLoc());
}
/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 428a8f4..dd26a05 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -304,18 +304,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
}
bool IsSingleScalar = vputils::isSingleScalar(Def);
-
VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1);
- // Check if there is a scalar value for the selected lane.
- if (!hasScalarValue(Def, LastLane)) {
- // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
- // VPExpandSCEVRecipes can also be a single scalar.
- assert((isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe,
- VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
- "unexpected recipe found to be invariant");
- IsSingleScalar = true;
- LastLane = 0;
- }
// We need to construct the vector value for a single-scalar value by
// broadcasting the scalar to all lanes.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index cfe1f1e..3062e1c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1163,10 +1163,10 @@ public:
bool opcodeMayReadOrWriteFromMemory() const;
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override;
+ bool usesFirstLaneOnly(const VPValue *Op) const override;
/// Returns true if the recipe only uses the first part of operand \p Op.
- bool onlyFirstPartUsed(const VPValue *Op) const override;
+ bool usesFirstPartOnly(const VPValue *Op) const override;
/// Returns true if this VPInstruction produces a scalar value from a vector,
/// e.g. by performing a reduction or extracting a lane.
@@ -1393,13 +1393,13 @@ public:
return true;
}
- bool onlyFirstPartUsed(const VPValue *Op) const override {
+ bool usesFirstPartOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
}
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
@@ -1628,7 +1628,7 @@ public:
VPSlotTracker &SlotTracker) const override;
#endif
- bool onlyFirstLaneUsed(const VPValue *Op) const override;
+ bool usesFirstLaneOnly(const VPValue *Op) const override;
};
/// A recipe for widening Call instructions using library calls.
@@ -1725,7 +1725,9 @@ public:
#endif
};
-/// A recipe for widening select instructions.
+/// A recipe for widening select instructions. Supports both wide vector and
+/// single-scalar conditions, matching the behavior of LLVM IR's select
+/// instruction.
struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
public VPIRMetadata {
VPWidenSelectRecipe(SelectInst &I, ArrayRef<VPValue *> Operands)
@@ -1765,7 +1767,7 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
}
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return Op == getCond() && isInvariantCond();
@@ -1831,7 +1833,7 @@ public:
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
if (Op == getOperand(0))
@@ -1868,7 +1870,7 @@ public:
void execute(VPTransformState &State) override;
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
@@ -1882,7 +1884,7 @@ public:
}
/// Returns true if the recipe only uses the first part of operand \p Op.
- bool onlyFirstPartUsed(const VPValue *Op) const override {
+ bool usesFirstPartOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
assert(getNumOperands() <= 2 && "must have at most two operands");
@@ -1920,14 +1922,14 @@ public:
Type *getSourceElementType() const { return SourceElementTy; }
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
}
/// Returns true if the recipe only uses the first part of operand \p Op.
- bool onlyFirstPartUsed(const VPValue *Op) const override {
+ bool usesFirstPartOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
assert(getNumOperands() <= 2 && "must have at most two operands");
@@ -2108,7 +2110,7 @@ public:
}
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// The recipe creates its own wide start value, so it only requests the
@@ -2323,7 +2325,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return Op == getStartValue();
@@ -2397,7 +2399,7 @@ public:
bool isInLoop() const { return IsInLoop; }
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return isOrdered() || isInLoop();
@@ -2466,13 +2468,13 @@ public:
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// Recursing through Blend recipes only, must terminate at header phi's the
// latest.
return all_of(users(),
- [this](VPUser *U) { return U->onlyFirstLaneUsed(this); });
+ [this](VPUser *U) { return U->usesFirstLaneOnly(this); });
}
};
@@ -2560,7 +2562,7 @@ public:
VPCostContext &Ctx) const override;
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override = 0;
+ bool usesFirstLaneOnly(const VPValue *Op) const override = 0;
/// Returns the number of stored operands of this interleave group. Returns 0
/// for load interleave groups.
@@ -2606,7 +2608,7 @@ public:
VPSlotTracker &SlotTracker) const override;
#endif
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
@@ -2654,7 +2656,7 @@ public:
#endif
/// The recipe only uses the first lane of the address, and EVL operand.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) ||
@@ -2860,7 +2862,7 @@ public:
VPValue *getEVL() const { return getOperand(2); }
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return Op == getEVL();
@@ -2922,7 +2924,7 @@ public:
bool isPredicated() const { return IsPredicated; }
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return isSingleScalar();
@@ -3204,14 +3206,14 @@ protected:
VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
std::initializer_list<VPValue *> Operands,
- bool Consecutive, bool Reverse, Align Alignment,
+ bool Consecutive, bool Reverse,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
- Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) {
+ Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive),
+ Reverse(Reverse) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
- assert(isa<VPVectorEndPointerRecipe>(getAddr()) ||
- !Reverse &&
- "Reversed acccess without VPVectorEndPointerRecipe address?");
+ assert((isa<VPVectorEndPointerRecipe>(getAddr()) || !Reverse) &&
+ "Reversed acccess without VPVectorEndPointerRecipe address?");
}
public:
@@ -3271,18 +3273,18 @@ public:
struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
public VPValue {
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
- bool Consecutive, bool Reverse, Align Alignment,
+ bool Consecutive, bool Reverse,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
- Reverse, Alignment, Metadata, DL),
+ Reverse, Metadata, DL),
VPValue(this, &Load) {
setMask(Mask);
}
VPWidenLoadRecipe *clone() override {
return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
- getMask(), Consecutive, Reverse, getAlign(),
- *this, getDebugLoc());
+ getMask(), Consecutive, Reverse, *this,
+ getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
@@ -3297,7 +3299,7 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// Widened, consecutive loads operations only demand the first lane of
@@ -3313,8 +3315,8 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
- {Addr, &EVL}, L.isConsecutive(), L.isReverse(),
- L.getAlign(), L, L.getDebugLoc()),
+ {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
+ L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -3338,7 +3340,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// Widened loads only demand the first lane of EVL and consecutive loads
@@ -3352,16 +3354,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
VPValue *Mask, bool Consecutive, bool Reverse,
- Align Alignment, const VPIRMetadata &Metadata, DebugLoc DL)
+ const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
- Consecutive, Reverse, Alignment, Metadata, DL) {
+ Consecutive, Reverse, Metadata, DL) {
setMask(Mask);
}
VPWidenStoreRecipe *clone() override {
return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
getStoredValue(), getMask(), Consecutive,
- Reverse, getAlign(), *this, getDebugLoc());
+ Reverse, *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -3379,7 +3381,7 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// Widened, consecutive stores only demand the first lane of their address,
@@ -3396,7 +3398,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
{Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
- S.isReverse(), S.getAlign(), S, S.getDebugLoc()) {
+ S.isReverse(), S, S.getDebugLoc()) {
setMask(Mask);
}
@@ -3422,7 +3424,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
if (Op == getEVL()) {
@@ -3506,14 +3508,14 @@ public:
}
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
}
/// Returns true if the recipe only uses the first part of operand \p Op.
- bool onlyFirstPartUsed(const VPValue *Op) const override {
+ bool usesFirstPartOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
@@ -3588,7 +3590,7 @@ public:
}
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
@@ -3698,7 +3700,7 @@ public:
VPValue *getStepValue() const { return getOperand(2); }
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
@@ -3763,7 +3765,7 @@ public:
VPValue *getStepValue() const { return getOperand(1); }
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1ee405a..80cd112 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -659,7 +659,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
case Instruction::Select: {
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
- Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
+ Value *Cond =
+ State.get(getOperand(0),
+ OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0)));
Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
return Builder.CreateSelect(Cond, Op1, Op2, Name);
@@ -1274,7 +1276,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
}
}
-bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
+bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
return vputils::onlyFirstLaneUsed(this);
@@ -1323,7 +1325,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
llvm_unreachable("switch should return");
}
-bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
+bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
if (Instruction::isBinaryOp(getOpcode()))
return vputils::onlyFirstPartUsed(this);
@@ -1690,7 +1692,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
if (!VFTy->getParamType(I.index())->isVectorTy())
Arg = State.get(I.value(), VPLane(0));
else
- Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
+ Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
Args.push_back(Arg);
}
@@ -1759,7 +1761,7 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
State.TTI))
Arg = State.get(I.value(), VPLane(0));
else
- Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
+ Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
State.TTI))
TysForDecl.push_back(Arg->getType());
@@ -1841,7 +1843,7 @@ StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
return Intrinsic::getBaseName(VectorIntrinsicID);
}
-bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
+bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
return all_of(enumerate(operands()), [this, &Op](const auto &X) {
auto [Idx, V] = X;
@@ -1968,16 +1970,13 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
getOperand(1)->printAsOperand(O, SlotTracker);
O << ", ";
getOperand(2)->printAsOperand(O, SlotTracker);
- O << (isInvariantCond() ? " (condition is loop invariant)" : "");
+ O << (vputils::isSingleScalar(getCond()) ? " (condition is single-scalar)"
+ : "");
}
#endif
void VPWidenSelectRecipe::execute(VPTransformState &State) {
- // The condition can be loop invariant but still defined inside the
- // loop. This means that we can't just use the original 'cond' value.
- // We have to take the 'vectorized' value and pick the first lane.
- // Instcombine will make this a no-op.
- Value *Cond = State.get(getCond(), isInvariantCond());
+ Value *Cond = State.get(getCond(), vputils::isSingleScalar(getCond()));
Value *Op0 = State.get(getOperand(1));
Value *Op1 = State.get(getOperand(2));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9d9bb14..48bd697 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -91,14 +91,13 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
NewRecipe = new VPWidenLoadRecipe(
*Load, Ingredient.getOperand(0), nullptr /*Mask*/,
- false /*Consecutive*/, false /*Reverse*/, Load->getAlign(),
- VPIRMetadata(*Load), Ingredient.getDebugLoc());
+ false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
+ Ingredient.getDebugLoc());
} else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
NewRecipe = new VPWidenStoreRecipe(
*Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
- Store->getAlign(), VPIRMetadata(*Store),
- Ingredient.getDebugLoc());
+ VPIRMetadata(*Store), Ingredient.getDebugLoc());
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
@@ -154,27 +153,31 @@ static bool sinkScalarOperands(VPlan &Plan) {
bool ScalarVFOnly = Plan.hasScalarVFOnly();
bool Changed = false;
- auto IsValidSinkCandidate = [ScalarVFOnly](VPBasicBlock *SinkTo,
- VPSingleDefRecipe *Candidate) {
- // We only know how to duplicate VPReplicateRecipes and
- // VPScalarIVStepsRecipes for now.
+ SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
+ auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
+ VPBasicBlock *SinkTo, VPValue *Op) {
+ auto *Candidate =
+ dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
+ if (!Candidate)
+ return;
+
+ // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
+ // for now.
if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
- return false;
+ return;
- if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() ||
- Candidate->mayReadOrWriteMemory())
- return false;
+ if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
+ return;
if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
if (!ScalarVFOnly && RepR->isSingleScalar())
- return false;
+ return;
- return true;
+ WorkList.insert({SinkTo, Candidate});
};
// First, collect the operands of all recipes in replicate blocks as seeds for
// sinking.
- SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
@@ -182,14 +185,9 @@ static bool sinkScalarOperands(VPlan &Plan) {
VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
continue;
- for (auto &Recipe : *VPBB) {
- for (VPValue *Op : Recipe.operands()) {
- if (auto *Def =
- dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
- if (IsValidSinkCandidate(VPBB, Def))
- WorkList.insert({VPBB, Def});
- }
- }
+ for (auto &Recipe : *VPBB)
+ for (VPValue *Op : Recipe.operands())
+ InsertIfValidSinkCandidate(VPBB, Op);
}
// Try to sink each replicate or scalar IV steps recipe in the worklist.
@@ -198,15 +196,15 @@ static bool sinkScalarOperands(VPlan &Plan) {
VPSingleDefRecipe *SinkCandidate;
std::tie(SinkTo, SinkCandidate) = WorkList[I];
- // All recipe users of the sink candidate must be in the same block SinkTo
- // or all users outside of SinkTo must have only their first lane used. In
+ // All recipe users of SinkCandidate must be in the same block SinkTo or all
+ // users outside of SinkTo must only use the first lane of SinkCandidate. In
// the latter case, we need to duplicate SinkCandidate.
auto UsersOutsideSinkTo =
make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
return cast<VPRecipeBase>(U)->getParent() != SinkTo;
});
if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
- return !U->onlyFirstLaneUsed(SinkCandidate);
+ return !U->usesFirstLaneOnly(SinkCandidate);
}))
continue;
bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
@@ -234,10 +232,7 @@ static bool sinkScalarOperands(VPlan &Plan) {
}
SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
for (VPValue *Op : SinkCandidate->operands())
- if (auto *Def =
- dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
- if (IsValidSinkCandidate(SinkTo, Def))
- WorkList.insert({SinkTo, Def});
+ InsertIfValidSinkCandidate(SinkTo, Op);
Changed = true;
}
return Changed;
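
The candidate checks are now folded into InsertIfValidSinkCandidate, so the initial seeding loop and the re-seeding after each move share one path while the worklist grows during iteration. Below is a minimal standalone sketch of that worklist pattern using toy types rather than the VPlan API; all names are illustrative only, and removal from the old block is elided.

#include <set>
#include <utility>
#include <vector>

struct ToyBlock;

struct ToyRecipe {
  ToyBlock *Parent = nullptr;
  std::vector<ToyRecipe *> Operands;
  bool HasSideEffects = false;
};

struct ToyBlock {
  std::vector<ToyRecipe *> Recipes;
};

// Sink side-effect-free defining recipes of operands used in SinkTo into
// SinkTo, then re-seed the worklist from the operands of each moved recipe so
// whole scalar chains can follow.
static bool sinkOperandsInto(ToyBlock &SinkTo) {
  std::set<std::pair<ToyBlock *, ToyRecipe *>> Seen;
  std::vector<std::pair<ToyBlock *, ToyRecipe *>> WorkList;
  auto InsertIfValidSinkCandidate = [&](ToyBlock *Dst, ToyRecipe *Candidate) {
    if (!Candidate || Candidate->Parent == Dst || Candidate->HasSideEffects)
      return;
    if (Seen.insert({Dst, Candidate}).second)
      WorkList.push_back({Dst, Candidate});
  };

  for (ToyRecipe *R : SinkTo.Recipes)
    for (ToyRecipe *Op : R->Operands)
      InsertIfValidSinkCandidate(&SinkTo, Op);

  bool Changed = false;
  // Index-based loop: the worklist may grow while it is being processed.
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    auto [Dst, Candidate] = WorkList[I];
    Candidate->Parent = Dst;
    Dst->Recipes.insert(Dst->Recipes.begin(), Candidate);
    for (ToyRecipe *Op : Candidate->Operands)
      InsertIfValidSinkCandidate(Dst, Op);
    Changed = true;
  }
  return Changed;
}
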
@@ -1290,6 +1285,15 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
return;
}
+ // Look through a broadcast of a single scalar used as a select condition; in
+ // that case the scalar condition can be used directly.
+ if (match(Def,
+ m_Select(m_Broadcast(m_VPValue(C)), m_VPValue(), m_VPValue())) &&
+ vputils::isSingleScalar(C)) {
+ Def->setOperand(0, C);
+ return;
+ }
+
if (auto *Phi = dyn_cast<VPPhi>(Def)) {
if (Phi->getNumOperands() == 1)
Phi->replaceAllUsesWith(Phi->getOperand(0));
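
The new simplification strips a broadcast that feeds a select condition when the broadcast source is a single scalar, since a uniform condition picks the same operand in every lane. A hedged sketch of the rewrite shape over toy nodes (hypothetical types, not the VPlanPatternMatch API):

struct ToyVal {
  enum Kind { SingleScalar, Broadcast, Select, Other } K = Other;
  ToyVal *Src = nullptr;                                       // Broadcast source
  ToyVal *Cond = nullptr, *TrueV = nullptr, *FalseV = nullptr; // Select operands
};

// select(broadcast(c), a, b) with c a single scalar -> select(c, a, b); the
// scalar condition is used directly and the broadcast becomes dead.
static void simplifySelectCond(ToyVal &Sel) {
  if (Sel.K != ToyVal::Select || !Sel.Cond || Sel.Cond->K != ToyVal::Broadcast)
    return;
  if (ToyVal *C = Sel.Cond->Src; C && C->K == ToyVal::SingleScalar)
    Sel.Cond = C;
}
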
@@ -4178,6 +4182,59 @@ static bool isAlreadyNarrow(VPValue *VPV) {
return RepR && RepR->isSingleScalar();
}
+// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
+// a narrow variant.
+static VPValue *
+narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
+ auto *R = V->getDefiningRecipe();
+ if (!R || NarrowedOps.contains(V))
+ return V;
+
+ if (isAlreadyNarrow(V))
+ return V;
+
+ if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
+ for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
+ WideMember0->setOperand(
+ Idx,
+ narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
+ return V;
+ }
+
+ if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
+ // Narrow interleave group to wide load, as the transformed VPlan will only
+ // process one original iteration.
+ auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
+ auto *L = new VPWidenLoadRecipe(
+ *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
+ /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
+ L->insertBefore(LoadGroup);
+ NarrowedOps.insert(L);
+ return L;
+ }
+
+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
+ assert(RepR->isSingleScalar() &&
+ isa<LoadInst>(RepR->getUnderlyingInstr()) &&
+ "must be a single scalar load");
+ NarrowedOps.insert(RepR);
+ return RepR;
+ }
+
+ auto *WideLoad = cast<VPWidenLoadRecipe>(R);
+ VPValue *PtrOp = WideLoad->getAddr();
+ if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
+ PtrOp = VecPtr->getOperand(0);
+ // Narrow wide load to uniform scalar load, as the transformed VPlan will
+ // only process one original iteration.
+ auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
+ /*IsUniform*/ true,
+ /*Mask*/ nullptr, *WideLoad);
+ N->insertBefore(WideLoad);
+ NarrowedOps.insert(N);
+ return N;
+}
+
void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
unsigned VectorRegWidth) {
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
@@ -4279,65 +4336,15 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
SmallPtrSet<VPValue *, 4> NarrowedOps;
- auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
- auto *R = V->getDefiningRecipe();
- if (!R || NarrowedOps.contains(V))
- return V;
- if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
- // Narrow interleave group to wide load, as transformed VPlan will only
- // process one original iteration.
- auto *LI =
- cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
- auto *L = new VPWidenLoadRecipe(
- *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
- /*Reverse=*/false, LI->getAlign(), {}, LoadGroup->getDebugLoc());
- L->insertBefore(LoadGroup);
- NarrowedOps.insert(L);
- return L;
- }
-
- if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
- assert(RepR->isSingleScalar() &&
- isa<LoadInst>(RepR->getUnderlyingInstr()) &&
- "must be a single scalar load");
- NarrowedOps.insert(RepR);
- return RepR;
- }
- auto *WideLoad = cast<VPWidenLoadRecipe>(R);
-
- VPValue *PtrOp = WideLoad->getAddr();
- if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
- PtrOp = VecPtr->getOperand(0);
- // Narrow wide load to uniform scalar load, as transformed VPlan will only
- // process one original iteration.
- auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
- /*IsUniform*/ true,
- /*Mask*/ nullptr, *WideLoad);
- N->insertBefore(WideLoad);
- NarrowedOps.insert(N);
- return N;
- };
-
// Narrow operation tree rooted at store groups.
for (auto *StoreGroup : StoreGroups) {
- VPValue *Res = nullptr;
- VPValue *Member0 = StoreGroup->getStoredValues()[0];
- if (isAlreadyNarrow(Member0)) {
- Res = Member0;
- } else if (auto *WideMember0 =
- dyn_cast<VPWidenRecipe>(Member0->getDefiningRecipe())) {
- for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
- WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx)));
- Res = WideMember0;
- } else {
- Res = NarrowOp(Member0);
- }
-
+ VPValue *Res =
+ narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
auto *SI =
cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
auto *S = new VPWidenStoreRecipe(
*SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
- /*Reverse=*/false, SI->getAlign(), {}, StoreGroup->getDebugLoc());
+ /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
S->insertBefore(StoreGroup);
StoreGroup->eraseFromParent();
}
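
Hoisting the per-operand narrowing into narrowInterleaveGroupOp makes it recursive through VPWidenRecipe operands, with the NarrowedOps set ensuring a value shared by several store groups is rewritten only once. A simplified sketch of that memoized walk over toy nodes (illustrative types, not the VPlan recipe classes):

#include <set>
#include <vector>

struct ToyNode {
  enum Kind { WideOp, LoadGroup, AlreadyNarrow } K;
  std::vector<ToyNode *> Operands;
  bool NarrowedToWideLoad = false; // stands in for building a VPWidenLoadRecipe
};

// Memoized walk rooted at a store group's first stored value: recurse through
// wide ops, rewrite each load group once, and leave narrow values untouched.
static ToyNode *narrowOp(ToyNode *N, std::set<ToyNode *> &NarrowedOps) {
  if (NarrowedOps.count(N) || N->K == ToyNode::AlreadyNarrow)
    return N;
  if (N->K == ToyNode::WideOp) {
    for (ToyNode *&Op : N->Operands)
      Op = narrowOp(Op, NarrowedOps);
    return N;
  }
  // LoadGroup: the real code replaces the interleave group with a single
  // consecutive wide load here.
  N->NarrowedToWideLoad = true;
  NarrowedOps.insert(N);
  return N;
}
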
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index d6a0028..d4b8b72b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -582,7 +582,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
/// Users that only demand the first lane can use the definition for lane
/// 0.
DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) {
- return U.onlyFirstLaneUsed(DefR);
+ return U.usesFirstLaneOnly(DefR);
});
// Update each build vector user that currently has DefR as its only
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index c6380d3..e22c5df 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -18,12 +18,12 @@ using namespace llvm::VPlanPatternMatch;
bool vputils::onlyFirstLaneUsed(const VPValue *Def) {
return all_of(Def->users(),
- [Def](const VPUser *U) { return U->onlyFirstLaneUsed(Def); });
+ [Def](const VPUser *U) { return U->usesFirstLaneOnly(Def); });
}
bool vputils::onlyFirstPartUsed(const VPValue *Def) {
return all_of(Def->users(),
- [Def](const VPUser *U) { return U->onlyFirstPartUsed(Def); });
+ [Def](const VPUser *U) { return U->usesFirstPartOnly(Def); });
}
bool vputils::onlyScalarValuesUsed(const VPValue *Def) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 83e3fca..5da7463 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -274,12 +274,12 @@ public:
virtual bool usesScalars(const VPValue *Op) const {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
- return onlyFirstLaneUsed(Op);
+ return usesFirstLaneOnly(Op);
}
/// Returns true if the VPUser only uses the first lane of operand \p Op.
/// Conservatively returns false.
- virtual bool onlyFirstLaneUsed(const VPValue *Op) const {
+ virtual bool usesFirstLaneOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return false;
@@ -287,7 +287,7 @@ public:
/// Returns true if the VPUser only uses the first part of operand \p Op.
/// Conservatively returns false.
- virtual bool onlyFirstPartUsed(const VPValue *Op) const {
+ virtual bool usesFirstPartOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return false;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 91734a1..34754a1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -252,6 +252,13 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
for (const VPUser *U : V->users()) {
auto *UI = cast<VPRecipeBase>(U);
+ if (isa<VPIRPhi>(UI) &&
+ UI->getNumOperands() != UI->getParent()->getNumPredecessors()) {
+ errs() << "Phi-like recipe with different number of operands and "
+ "predecessors.\n";
+ return false;
+ }
+
if (auto *Phi = dyn_cast<VPPhiAccessors>(UI)) {
for (const auto &[IncomingVPV, IncomingVPBB] :
Phi->incoming_values_and_blocks()) {
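
The added check rejects phi-like recipes whose operand count no longer matches the number of predecessors of their parent block, an invariant that block-manipulating transforms can break. A minimal sketch of the same invariant over toy structures (hypothetical names, not the VPlan verifier API):

#include <cstdio>
#include <vector>

struct ToyBlock {
  std::vector<ToyBlock *> Predecessors;
};

struct ToyPhi {
  ToyBlock *Parent = nullptr;
  std::vector<int> Operands; // expected: one incoming value per predecessor
};

// Report a malformed phi-like recipe whose operand count diverges from the
// predecessor count of its parent block.
static bool verifyPhiShape(const ToyPhi &Phi) {
  if (Phi.Operands.size() != Phi.Parent->Predecessors.size()) {
    std::fprintf(stderr, "Phi-like recipe with different number of operands "
                         "and predecessors.\n");
    return false;
  }
  return true;
}
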
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index d6eb00d..27a8bbd 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2017,8 +2017,31 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) {
Value *ScalarV = Ext->getOperand(0);
if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
- &DT))
- ScalarV = Builder.CreateFreeze(ScalarV);
+ &DT)) {
+ // Check whether all lanes are extracted, all extracts trigger UB
+ // on poison, and the last extract (and hence all previous ones)
+ // is guaranteed to execute if Ext executes. If so, we do not
+ // need to insert a freeze.
+ SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
+ bool AllExtractsTriggerUB = true;
+ ExtractElementInst *LastExtract = nullptr;
+ BasicBlock *ExtBB = Ext->getParent();
+ for (User *U : Ext->users()) {
+ auto *Extract = cast<ExtractElementInst>(U);
+ if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
+ AllExtractsTriggerUB = false;
+ break;
+ }
+ ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
+ if (!LastExtract || LastExtract->comesBefore(Extract))
+ LastExtract = Extract;
+ }
+ if (ExtractedLanes.size() != DstTy->getNumElements() ||
+ !AllExtractsTriggerUB ||
+ !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(),
+ LastExtract->getIterator()))
+ ScalarV = Builder.CreateFreeze(ScalarV);
+ }
ScalarV = Builder.CreateBitCast(
ScalarV,
IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
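
The freeze can only be skipped when every lane of the extended vector is extracted, each extract makes the program undefined if it yields poison, and execution is guaranteed to reach the last extract once Ext runs. A standalone sketch of that decision, with the LLVM analyses (programUndefinedIfPoison, isGuaranteedToTransferExecutionToSuccessor) reduced to precomputed flags and toy types, illustrative only:

#include <set>
#include <vector>

struct ToyExtract {
  unsigned Lane;
  bool TriggersUBOnPoison; // stands in for programUndefinedIfPoison()
};

// The freeze of the not-proven-non-poison source may be omitted only if all
// lanes are read, every read triggers UB on poison, and the last read in the
// block is always reached once the extension executes.
static bool canOmitFreeze(const std::vector<ToyExtract> &Extracts,
                          unsigned NumLanes, bool LastExtractAlwaysReached) {
  std::set<unsigned> Lanes;
  for (const ToyExtract &E : Extracts) {
    if (!E.TriggersUBOnPoison)
      return false;
    Lanes.insert(E.Lane);
  }
  return LastExtractAlwaysReached && Lanes.size() == NumLanes;
}
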