aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Analysis/ConstantFolding.cpp20
-rw-r--r--llvm/lib/Analysis/DXILResource.cpp7
-rw-r--r--llvm/lib/Analysis/DependenceAnalysis.cpp17
-rw-r--r--llvm/lib/Analysis/MemoryDependenceAnalysis.cpp44
-rw-r--r--llvm/lib/Analysis/ProfileSummaryInfo.cpp14
-rw-r--r--llvm/lib/Analysis/TargetLibraryInfo.cpp28
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AIXException.cpp4
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/ARMException.cpp2
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp9
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp4
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp6
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp2
-rw-r--r--llvm/lib/CodeGen/CodeGen.cpp1
-rw-r--r--llvm/lib/CodeGen/CodeGenPrepare.cpp4
-rw-r--r--llvm/lib/CodeGen/ExpandFp.cpp33
-rw-r--r--llvm/lib/CodeGen/InterleavedAccessPass.cpp123
-rw-r--r--llvm/lib/CodeGen/MIRPrinter.cpp3
-rw-r--r--llvm/lib/CodeGen/MachineInstrBundle.cpp21
-rw-r--r--llvm/lib/CodeGen/MachineLICM.cpp1
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp54
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h4
-rw-r--r--llvm/lib/CodeGen/StackProtector.cpp10
-rw-r--r--llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp19
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp3
-rw-r--r--llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp21
-rw-r--r--llvm/lib/FileCheck/FileCheck.cpp6
-rw-r--r--llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp474
-rw-r--r--llvm/lib/IR/AutoUpgrade.cpp7
-rw-r--r--llvm/lib/IR/DiagnosticInfo.cpp4
-rw-r--r--llvm/lib/IR/Type.cpp18
-rw-r--r--llvm/lib/MC/CMakeLists.txt6
-rw-r--r--llvm/lib/MC/ELFObjectWriter.cpp29
-rw-r--r--llvm/lib/MC/GOFFObjectWriter.cpp2
-rw-r--r--llvm/lib/MC/MCAsmInfoCOFF.cpp108
-rw-r--r--llvm/lib/MC/MCAsmInfoDarwin.cpp5
-rw-r--r--llvm/lib/MC/MCAsmInfoELF.cpp196
-rw-r--r--llvm/lib/MC/MCAsmInfoGOFF.cpp138
-rw-r--r--llvm/lib/MC/MCAsmInfoWasm.cpp83
-rw-r--r--llvm/lib/MC/MCAsmInfoXCOFF.cpp124
-rw-r--r--llvm/lib/MC/MCAsmStreamer.cpp31
-rw-r--r--llvm/lib/MC/MCAssembler.cpp2
-rw-r--r--llvm/lib/MC/MCContext.cpp54
-rw-r--r--llvm/lib/MC/MCELFStreamer.cpp4
-rw-r--r--llvm/lib/MC/MCFragment.cpp2
-rw-r--r--llvm/lib/MC/MCGOFFStreamer.cpp20
-rw-r--r--llvm/lib/MC/MCMachOStreamer.cpp7
-rw-r--r--llvm/lib/MC/MCObjectStreamer.cpp36
-rw-r--r--llvm/lib/MC/MCParser/AsmParser.cpp11
-rw-r--r--llvm/lib/MC/MCParser/ELFAsmParser.cpp4
-rw-r--r--llvm/lib/MC/MCParser/MasmParser.cpp3
-rw-r--r--llvm/lib/MC/MCParser/WasmAsmParser.cpp2
-rw-r--r--llvm/lib/MC/MCSection.cpp8
-rw-r--r--llvm/lib/MC/MCSectionCOFF.cpp117
-rw-r--r--llvm/lib/MC/MCSectionDXContainer.cpp15
-rw-r--r--llvm/lib/MC/MCSectionELF.cpp217
-rw-r--r--llvm/lib/MC/MCSectionGOFF.cpp143
-rw-r--r--llvm/lib/MC/MCSectionMachO.cpp28
-rw-r--r--llvm/lib/MC/MCSectionWasm.cpp101
-rw-r--r--llvm/lib/MC/MCSectionXCOFF.cpp134
-rw-r--r--llvm/lib/MC/MCStreamer.cpp38
-rw-r--r--llvm/lib/MC/MCTargetOptions.cpp3
-rw-r--r--llvm/lib/MC/MCTargetOptionsCommandFlags.cpp7
-rw-r--r--llvm/lib/MC/MCWasmStreamer.cpp2
-rw-r--r--llvm/lib/MC/MCWinCOFFStreamer.cpp3
-rw-r--r--llvm/lib/MC/MCXCOFFStreamer.cpp14
-rw-r--r--llvm/lib/MC/MachObjectWriter.cpp32
-rw-r--r--llvm/lib/MC/WasmObjectWriter.cpp2
-rw-r--r--llvm/lib/MC/WinCOFFObjectWriter.cpp5
-rw-r--r--llvm/lib/MC/XCOFFObjectWriter.cpp8
-rw-r--r--llvm/lib/Passes/PassBuilder.cpp1
-rw-r--r--llvm/lib/Passes/PassRegistry.def3
-rw-r--r--llvm/lib/ProfileData/InstrProfReader.cpp2
-rw-r--r--llvm/lib/Support/AArch64AttributeParser.cpp27
-rw-r--r--llvm/lib/Support/CommandLine.cpp27
-rw-r--r--llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp38
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp163
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h7
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp270
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.h4
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td69
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.td3
-rw-r--r--llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp23
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp2
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp3
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp2
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td55
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGISel.td6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp61
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp151
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h12
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp21
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp317
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp239
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp25
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp31
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h3
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td176
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.h40
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h21
-rw-r--r--llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/R600TargetMachine.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/SIDefines.h12
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp69
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp61
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h3
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td8
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td10
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp36
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td43
-rw-r--r--llvm/lib/Target/AMDGPU/SMInstructions.td5
-rw-r--r--llvm/lib/Target/AMDGPU/VOP2Instructions.td17
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td177
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp12
-rw-r--r--llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp4
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp14
-rw-r--r--llvm/lib/Target/AVR/AVRAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp2
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h4
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp2
-rw-r--r--llvm/lib/Target/BPF/BTFDebug.cpp6
-rw-r--r--llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp7
-rw-r--r--llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp1
-rw-r--r--llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h5
-rw-r--r--llvm/lib/Target/DirectX/DXILDataScalarization.cpp46
-rw-r--r--llvm/lib/Target/DirectX/DXILFlattenArrays.cpp9
-rw-r--r--llvm/lib/Target/DirectX/DXILLegalizePass.cpp97
-rw-r--r--llvm/lib/Target/DirectX/DXILResourceAccess.cpp3
-rw-r--r--llvm/lib/Target/DirectX/DXILRootSignature.cpp471
-rw-r--r--llvm/lib/Target/DirectX/DXILRootSignature.h11
-rw-r--r--llvm/lib/Target/DirectX/DXILShaderFlags.cpp15
-rw-r--r--llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp1
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp31
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h2
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp5
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp7
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td6
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h2
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp32
-rw-r--r--llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp9
-rw-r--r--llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp111
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h1
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp6
-rw-r--r--llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/RISCV/RISCVFeatures.td6
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp62
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.h3
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.cpp3
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoC.td121
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoP.td54
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td95
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td33
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZc.td14
-rw-r--r--llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp15
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.td5
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td200
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetMachine.cpp8
-rw-r--r--llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp35
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVAPI.cpp4
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp6
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp1
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp3
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp4
-rw-r--r--llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp6
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssembly.td15
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp18
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td3
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td11
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp6
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp5
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h2
-rw-r--r--llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp30
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp40
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp1
-rw-r--r--llvm/lib/Target/X86/X86PassRegistry.def44
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp131
-rw-r--r--llvm/lib/TargetParser/TargetParser.cpp1
-rw-r--r--llvm/lib/TargetParser/Triple.cpp1
-rw-r--r--llvm/lib/TextAPI/SymbolSet.cpp5
-rw-r--r--llvm/lib/Transforms/Coroutines/Coroutines.cpp3
-rw-r--r--llvm/lib/Transforms/HipStdPar/HipStdPar.cpp220
-rw-r--r--llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp10
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp6
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp24
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp4
-rw-r--r--llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp17
-rw-r--r--llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp193
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp1
-rw-r--r--llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h16
-rw-r--r--llvm/lib/Transforms/ObjCARC/CMakeLists.txt1
-rw-r--r--llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp156
-rw-r--r--llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp192
-rw-r--r--llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp1
-rw-r--r--llvm/lib/Transforms/Scalar/LoopInterchange.cpp114
-rw-r--r--llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp12
-rw-r--r--llvm/lib/Transforms/Scalar/Scalarizer.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp49
-rw-r--r--llvm/lib/Transforms/Scalar/StructurizeCFG.cpp130
-rw-r--r--llvm/lib/Transforms/Utils/CMakeLists.txt1
-rw-r--r--llvm/lib/Transforms/Utils/Debugify.cpp1
-rw-r--r--llvm/lib/Transforms/Utils/LCSSA.cpp1
-rw-r--r--llvm/lib/Transforms/Utils/Local.cpp15
-rw-r--r--llvm/lib/Transforms/Utils/PredicateInfo.cpp1
-rw-r--r--llvm/lib/Transforms/Utils/ProfileVerify.cpp129
-rw-r--r--llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp36
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp27
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp703
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp6
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h13
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp8
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp5
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp48
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp51
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.h10
-rw-r--r--llvm/lib/Transforms/Vectorize/VectorCombine.cpp124
234 files changed, 5849 insertions, 3435 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index e71ba5e..759c553 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -929,12 +929,11 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
if (!AllConstantInt)
break;
- // TODO: Try to intersect two inrange attributes?
- if (!InRange) {
- InRange = GEP->getInRange();
- if (InRange)
- // Adjust inrange by offset until now.
- InRange = InRange->sextOrTrunc(BitWidth).subtract(Offset);
+ // Adjust inrange offset and intersect inrange attributes
+ if (auto GEPRange = GEP->getInRange()) {
+ auto AdjustedGEPRange = GEPRange->sextOrTrunc(BitWidth).subtract(Offset);
+ InRange =
+ InRange ? InRange->intersectWith(AdjustedGEPRange) : AdjustedGEPRange;
}
Ptr = cast<Constant>(GEP->getOperand(0));
@@ -2004,21 +2003,20 @@ inline bool llvm_fenv_testexcept() {
return false;
}
-static const APFloat FTZPreserveSign(const APFloat &V) {
+static APFloat FTZPreserveSign(const APFloat &V) {
if (V.isDenormal())
return APFloat::getZero(V.getSemantics(), V.isNegative());
return V;
}
-static const APFloat FlushToPositiveZero(const APFloat &V) {
+static APFloat FlushToPositiveZero(const APFloat &V) {
if (V.isDenormal())
return APFloat::getZero(V.getSemantics(), false);
return V;
}
-static const APFloat
-FlushWithDenormKind(const APFloat &V,
- DenormalMode::DenormalModeKind DenormKind) {
+static APFloat FlushWithDenormKind(const APFloat &V,
+ DenormalMode::DenormalModeKind DenormKind) {
assert(DenormKind != DenormalMode::DenormalModeKind::Invalid &&
DenormKind != DenormalMode::DenormalModeKind::Dynamic);
switch (DenormKind) {
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index 2da6468..1959ab6 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -1079,15 +1079,16 @@ void DXILResourceBindingInfo::populate(Module &M, DXILResourceTypeMap &DRTM) {
// add new space
S = &BS->Spaces.emplace_back(B.Space);
- // the space is full - set flag to report overlapping binding later
- if (S->FreeRanges.empty()) {
+ // The space is full - there are no free slots left, or the rest of the
+ // slots are taken by an unbounded array. Set flag to report overlapping
+ // binding later.
+ if (S->FreeRanges.empty() || S->FreeRanges.back().UpperBound < UINT32_MAX) {
OverlappingBinding = true;
continue;
}
// adjust the last free range lower bound, split it in two, or remove it
BindingRange &LastFreeRange = S->FreeRanges.back();
- assert(LastFreeRange.UpperBound == UINT32_MAX);
if (LastFreeRange.LowerBound == B.LowerBound) {
if (B.UpperBound < UINT32_MAX)
LastFreeRange.LowerBound = B.UpperBound + 1;
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index dd9a44b..f1473b2 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -3383,6 +3383,10 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst,
SrcSubscripts, DstSubscripts))
return false;
+ assert(isLoopInvariant(SrcBase, SrcLoop) &&
+ isLoopInvariant(DstBase, DstLoop) &&
+ "Expected SrcBase and DstBase to be loop invariant");
+
int Size = SrcSubscripts.size();
LLVM_DEBUG({
dbgs() << "\nSrcSubscripts: ";
@@ -3666,6 +3670,19 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
SCEVUnionPredicate(Assume, *SE));
}
+ // Even if the base pointers are the same, they may not be loop-invariant. It
+ // could lead to incorrect results, as we're analyzing loop-carried
+ // dependencies. Src and Dst can be in different loops, so we need to check
+ // the base pointer is invariant in both loops.
+ Loop *SrcLoop = LI->getLoopFor(Src->getParent());
+ Loop *DstLoop = LI->getLoopFor(Dst->getParent());
+ if (!isLoopInvariant(SrcBase, SrcLoop) ||
+ !isLoopInvariant(DstBase, DstLoop)) {
+ LLVM_DEBUG(dbgs() << "The base pointer is not loop invariant.\n");
+ return std::make_unique<Dependence>(Src, Dst,
+ SCEVUnionPredicate(Assume, *SE));
+ }
+
uint64_t EltSize = SrcLoc.Size.toRaw();
const SCEV *SrcEv = SE->getMinusSCEV(SrcSCEV, SrcBase);
const SCEV *DstEv = SE->getMinusSCEV(DstSCEV, DstBase);
diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index 3aa9909..2b0f212 100644
--- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -983,33 +983,37 @@ MemDepResult MemoryDependenceResults::getNonLocalInfoForBlock(
static void
SortNonLocalDepInfoCache(MemoryDependenceResults::NonLocalDepInfo &Cache,
unsigned NumSortedEntries) {
- switch (Cache.size() - NumSortedEntries) {
- case 0:
- // done, no new entries.
- break;
- case 2: {
- // Two new entries, insert the last one into place.
- NonLocalDepEntry Val = Cache.back();
- Cache.pop_back();
- MemoryDependenceResults::NonLocalDepInfo::iterator Entry =
- std::upper_bound(Cache.begin(), Cache.end() - 1, Val);
- Cache.insert(Entry, Val);
- [[fallthrough]];
+
+ // If only one entry, don't sort.
+ if (Cache.size() < 2)
+ return;
+
+ unsigned s = Cache.size() - NumSortedEntries;
+
+ // If the cache is already sorted, don't sort it again.
+ if (s == 0)
+ return;
+
+ // If no entry is sorted, sort the whole cache.
+ if (NumSortedEntries == 0) {
+ llvm::sort(Cache);
+ return;
}
- case 1:
- // One new entry, Just insert the new value at the appropriate position.
- if (Cache.size() != 1) {
+
+ // If the number of unsorted entires is small and the cache size is big, using
+ // insertion sort is faster. Here use Log2_32 to quickly choose the sort
+ // method.
+ if (s < Log2_32(Cache.size())) {
+ while (s > 0) {
NonLocalDepEntry Val = Cache.back();
Cache.pop_back();
MemoryDependenceResults::NonLocalDepInfo::iterator Entry =
- llvm::upper_bound(Cache, Val);
+ std::upper_bound(Cache.begin(), Cache.end() - s + 1, Val);
Cache.insert(Entry, Val);
+ s--;
}
- break;
- default:
- // Added many values, do a full scale sort.
+ } else {
llvm::sort(Cache);
- break;
}
}
diff --git a/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/llvm/lib/Analysis/ProfileSummaryInfo.cpp
index e8d4e37..f1c3155 100644
--- a/llvm/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/llvm/lib/Analysis/ProfileSummaryInfo.cpp
@@ -121,8 +121,18 @@ void ProfileSummaryInfo::computeThresholds() {
ProfileSummaryBuilder::getHotCountThreshold(DetailedSummary);
ColdCountThreshold =
ProfileSummaryBuilder::getColdCountThreshold(DetailedSummary);
- assert(ColdCountThreshold <= HotCountThreshold &&
- "Cold count threshold cannot exceed hot count threshold!");
+ // When the hot and cold thresholds are identical, we would classify
+ // a count value as both hot and cold since we are doing an inclusive check
+ // (see ::is{Hot|Cold}Count(). To avoid this undesirable overlap, ensure the
+ // thresholds are distinct.
+ if (HotCountThreshold == ColdCountThreshold) {
+ if (ColdCountThreshold > 0)
+ (*ColdCountThreshold)--;
+ else
+ (*HotCountThreshold)++;
+ }
+ assert(ColdCountThreshold < HotCountThreshold &&
+ "Cold count threshold should be less than hot count threshold!");
if (!hasPartialSampleProfile() || !ScalePartialSampleProfileWorkingSetSize) {
HasHugeWorkingSetSize =
HotEntry.NumCounts > ProfileSummaryHugeWorkingSetSizeThreshold;
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index e475be2..6e92766 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -875,6 +875,34 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc_toascii);
}
+ if (T.isOSFreeBSD()) {
+ TLI.setAvailable(LibFunc_dunder_strtok_r);
+ TLI.setAvailable(LibFunc_memalign);
+ TLI.setAvailable(LibFunc_fputc_unlocked);
+ TLI.setAvailable(LibFunc_fputs_unlocked);
+ TLI.setAvailable(LibFunc_fread_unlocked);
+ TLI.setAvailable(LibFunc_fwrite_unlocked);
+ TLI.setAvailable(LibFunc_getc_unlocked);
+ TLI.setAvailable(LibFunc_getchar_unlocked);
+ TLI.setAvailable(LibFunc_putc_unlocked);
+ TLI.setAvailable(LibFunc_putchar_unlocked);
+
+ TLI.setUnavailable(LibFunc___kmpc_alloc_shared);
+ TLI.setUnavailable(LibFunc___kmpc_free_shared);
+ TLI.setUnavailable(LibFunc_dunder_strndup);
+ TLI.setUnavailable(LibFunc_memccpy_chk);
+ TLI.setUnavailable(LibFunc_strlen_chk);
+ TLI.setUnavailable(LibFunc_fmaximum_num);
+ TLI.setUnavailable(LibFunc_fmaximum_numf);
+ TLI.setUnavailable(LibFunc_fmaximum_numl);
+ TLI.setUnavailable(LibFunc_fminimum_num);
+ TLI.setUnavailable(LibFunc_fminimum_numf);
+ TLI.setUnavailable(LibFunc_fminimum_numl);
+ TLI.setUnavailable(LibFunc_roundeven);
+ TLI.setUnavailable(LibFunc_roundevenf);
+ TLI.setUnavailable(LibFunc_roundevenl);
+ }
+
// As currently implemented in clang, NVPTX code has no standard library to
// speak of. Headers provide a standard-ish library implementation, but many
// of the signatures are wrong -- for example, many libm functions are not
diff --git a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
index 5d7c97a..6356d71 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
@@ -37,8 +37,8 @@ void AIXException::emitExceptionInfoTable(const MCSymbol *LSDA,
// unsigned long personality; /* Pointer to the personality routine */
// }
- auto *EHInfo =
- cast<MCSectionXCOFF>(Asm->getObjFileLowering().getCompactUnwindSection());
+ auto *EHInfo = static_cast<MCSectionXCOFF *>(
+ Asm->getObjFileLowering().getCompactUnwindSection());
if (Asm->TM.getFunctionSections()) {
// If option -ffunction-sections is on, append the function name to the
// name of EH Info Table csect so that each function has its own EH Info
diff --git a/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp b/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
index de6ebcf..51342c6 100644
--- a/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -39,7 +39,7 @@ void ARMException::beginFunction(const MachineFunction *MF) {
if (CFISecType == AsmPrinter::CFISection::Debug) {
if (!hasEmittedCFISections) {
if (Asm->getModuleCFISectionType() == AsmPrinter::CFISection::Debug)
- Asm->OutStreamer->emitCFISections(false, true);
+ Asm->OutStreamer->emitCFISections(false, true, false);
hasEmittedCFISections = true;
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index f1d3e96..6166271 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -4221,10 +4221,11 @@ MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const {
SectionKind Kind = CPE.getSectionKind(&DL);
const Constant *C = CPE.Val.ConstVal;
Align Alignment = CPE.Alignment;
- if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>(
- getObjFileLowering().getSectionForConstant(DL, Kind, C,
- Alignment))) {
- if (MCSymbol *Sym = S->getCOMDATSymbol()) {
+ auto *S =
+ getObjFileLowering().getSectionForConstant(DL, Kind, C, Alignment);
+ if (S && TM.getTargetTriple().isOSBinFormatCOFF()) {
+ if (MCSymbol *Sym =
+ static_cast<const MCSectionCOFF *>(S)->getCOMDATSymbol()) {
if (Sym->isUndefined())
OutStreamer->emitSymbolAttribute(Sym, MCSA_Global);
return Sym;
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 8abeb56..c5d6e40 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -1051,10 +1051,10 @@ void CodeViewDebug::switchToDebugSectionForSymbol(const MCSymbol *GVSym) {
// comdat key. A section may be comdat because of -ffunction-sections or
// because it is comdat in the IR.
MCSectionCOFF *GVSec =
- GVSym ? dyn_cast<MCSectionCOFF>(&GVSym->getSection()) : nullptr;
+ GVSym ? static_cast<MCSectionCOFF *>(&GVSym->getSection()) : nullptr;
const MCSymbol *KeySym = GVSec ? GVSec->getCOMDATSymbol() : nullptr;
- MCSectionCOFF *DebugSec = cast<MCSectionCOFF>(
+ auto *DebugSec = static_cast<MCSectionCOFF *>(
CompilerInfoAsm->getObjFileLowering().getCOFFDebugSymbolsSection());
DebugSec = OS.getContext().getAssociativeCOFFSection(DebugSec, KeySym);
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index 4fac4bb..6b8d08c 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -109,9 +109,11 @@ void DwarfCFIException::beginBasicBlockSection(const MachineBasicBlock &MBB) {
// chose not to be verbose in that case. And with `ForceDwarfFrameSection`,
// we should always emit .debug_frame.
if (CFISecType == AsmPrinter::CFISection::Debug ||
- Asm->TM.Options.ForceDwarfFrameSection)
+ Asm->TM.Options.ForceDwarfFrameSection ||
+ Asm->TM.Options.MCOptions.EmitSFrameUnwind)
Asm->OutStreamer->emitCFISections(
- CFISecType == AsmPrinter::CFISection::EH, true);
+ CFISecType == AsmPrinter::CFISection::EH, true,
+ Asm->TM.Options.MCOptions.EmitSFrameUnwind);
hasEmittedCFISections = true;
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 8e8cda4..5577a7d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -1379,7 +1379,7 @@ void DwarfCompileUnit::constructCallSiteParmEntryDIEs(
DIE *DwarfCompileUnit::constructImportedEntityDIE(
const DIImportedEntity *Module) {
- DIE *IMDie = DIE::get(DIEValueAllocator, (dwarf::Tag)Module->getTag());
+ DIE *IMDie = DIE::get(DIEValueAllocator, Module->getTag());
insertDIE(Module, IMDie);
DIE *EntityDie;
auto *Entity = Module->getEntity();
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index c3b4077..989cf4c4 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -45,7 +45,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeExpandPostRALegacyPass(Registry);
initializeFEntryInserterLegacyPass(Registry);
initializeFinalizeISelPass(Registry);
- initializeFinalizeMachineBundlesPass(Registry);
initializeFixupStatepointCallerSavedLegacyPass(Registry);
initializeFuncletLayoutPass(Registry);
initializeGCMachineCodeAnalysisPass(Registry);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index c21058c..416c56d 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2095,6 +2095,10 @@ static bool isRemOfLoopIncrementWithLoopInvariant(
if (!L->isLoopInvariant(RemAmt))
return false;
+ // Only works if the AddOffset is a loop invaraint
+ if (AddOffset && !L->isLoopInvariant(AddOffset))
+ return false;
+
// Is the PHI a loop increment?
auto LoopIncrInfo = getIVIncrement(PN, LI);
if (!LoopIncrInfo)
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index 714ec55..1c1047c 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -103,10 +103,10 @@ static void expandFPToI(Instruction *FPToI) {
Value *A1 = nullptr;
if (FloatVal->getType()->isHalfTy()) {
if (FPToI->getOpcode() == Instruction::FPToUI) {
- Value *A0 = Builder.CreateFPToUI(FloatVal, Builder.getIntNTy(32));
+ Value *A0 = Builder.CreateFPToUI(FloatVal, Builder.getInt32Ty());
A1 = Builder.CreateZExt(A0, IntTy);
} else { // FPToSI
- Value *A0 = Builder.CreateFPToSI(FloatVal, Builder.getIntNTy(32));
+ Value *A0 = Builder.CreateFPToSI(FloatVal, Builder.getInt32Ty());
A1 = Builder.CreateSExt(A0, IntTy);
}
FPToI->replaceAllUsesWith(A1);
@@ -425,8 +425,8 @@ static void expandIToFP(Instruction *IToFP) {
AAddr0->addIncoming(IsSigned ? Sub : IntVal, IfThen4);
AAddr0->addIncoming(Shl, SwBB);
Value *A0 = Builder.CreateTrunc(AAddr0, Builder.getInt32Ty());
- Value *A1 = Builder.CreateLShr(A0, Builder.getIntN(32, 2));
- Value *A2 = Builder.CreateAnd(A1, Builder.getIntN(32, 1));
+ Value *A1 = Builder.CreateLShr(A0, Builder.getInt32(2));
+ Value *A2 = Builder.CreateAnd(A1, Builder.getInt32(1));
Value *Conv16 = Builder.CreateZExt(A2, IntTy);
Value *Or17 = Builder.CreateOr(AAddr0, Conv16);
Value *Inc = Builder.CreateAdd(Or17, Builder.getIntN(BitWidth, 1));
@@ -457,9 +457,9 @@ static void expandIToFP(Instruction *IToFP) {
Value *Extract = Builder.CreateLShr(Shr21, Builder.getIntN(BitWidth, 32));
Value *ExtractT62 = nullptr;
if (FloatWidth > 80)
- ExtractT62 = Builder.CreateTrunc(Sub1, Builder.getIntNTy(64));
+ ExtractT62 = Builder.CreateTrunc(Sub1, Builder.getInt64Ty());
else
- ExtractT62 = Builder.CreateTrunc(Extract, Builder.getIntNTy(32));
+ ExtractT62 = Builder.CreateTrunc(Extract, Builder.getInt32Ty());
Builder.CreateBr(IfEnd26);
// if.else:
@@ -475,7 +475,7 @@ static void expandIToFP(Instruction *IToFP) {
Value *Extract65 = Builder.CreateLShr(Shl26, Builder.getIntN(BitWidth, 32));
Value *ExtractT66 = nullptr;
if (FloatWidth > 80)
- ExtractT66 = Builder.CreateTrunc(Sub2, Builder.getIntNTy(64));
+ ExtractT66 = Builder.CreateTrunc(Sub2, Builder.getInt64Ty());
else
ExtractT66 = Builder.CreateTrunc(Extract65, Builder.getInt32Ty());
Builder.CreateBr(IfEnd26);
@@ -507,30 +507,29 @@ static void expandIToFP(Instruction *IToFP) {
Builder.getIntN(BitWidth, 63));
And29 = Builder.CreateAnd(Shr, Temp2, "and29");
} else {
- Value *Conv28 = Builder.CreateTrunc(Shr, Builder.getIntNTy(32));
+ Value *Conv28 = Builder.CreateTrunc(Shr, Builder.getInt32Ty());
And29 = Builder.CreateAnd(
- Conv28, ConstantInt::getSigned(Builder.getIntNTy(32), 0x80000000));
+ Conv28, ConstantInt::getSigned(Builder.getInt32Ty(), 0x80000000));
}
unsigned TempMod = FPMantissaWidth % 32;
Value *And34 = nullptr;
Value *Shl30 = nullptr;
if (FloatWidth > 80) {
TempMod += 32;
- Value *Add = Builder.CreateShl(AAddr1Off32, Builder.getIntN(64, TempMod));
+ Value *Add = Builder.CreateShl(AAddr1Off32, Builder.getInt64(TempMod));
Shl30 = Builder.CreateAdd(
- Add,
- Builder.getIntN(64, ((1ull << (62ull - TempMod)) - 1ull) << TempMod));
- And34 = Builder.CreateZExt(Shl30, Builder.getIntNTy(128));
+ Add, Builder.getInt64(((1ull << (62ull - TempMod)) - 1ull) << TempMod));
+ And34 = Builder.CreateZExt(Shl30, Builder.getInt128Ty());
} else {
- Value *Add = Builder.CreateShl(E0, Builder.getIntN(32, TempMod));
+ Value *Add = Builder.CreateShl(E0, Builder.getInt32(TempMod));
Shl30 = Builder.CreateAdd(
- Add, Builder.getIntN(32, ((1 << (30 - TempMod)) - 1) << TempMod));
+ Add, Builder.getInt32(((1 << (30 - TempMod)) - 1) << TempMod));
And34 = Builder.CreateAnd(FloatWidth > 32 ? AAddr1Off32 : AAddr1Off0,
- Builder.getIntN(32, (1 << TempMod) - 1));
+ Builder.getInt32((1 << TempMod) - 1));
}
Value *Or35 = nullptr;
if (FloatWidth > 80) {
- Value *And29Trunc = Builder.CreateTrunc(And29, Builder.getIntNTy(128));
+ Value *And29Trunc = Builder.CreateTrunc(And29, Builder.getInt128Ty());
Value *Or31 = Builder.CreateOr(And29Trunc, And34);
Value *Or34 = Builder.CreateShl(Or31, Builder.getIntN(128, 64));
Value *Temp3 = Builder.CreateShl(Builder.getIntN(128, 1),
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 1b69188..5e50898 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -253,6 +253,21 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
return false;
}
+static Value *getMaskOperand(IntrinsicInst *II) {
+ switch (II->getIntrinsicID()) {
+ default:
+ llvm_unreachable("Unexpected intrinsic");
+ case Intrinsic::vp_load:
+ return II->getOperand(1);
+ case Intrinsic::masked_load:
+ return II->getOperand(2);
+ case Intrinsic::vp_store:
+ return II->getOperand(2);
+ case Intrinsic::masked_store:
+ return II->getOperand(3);
+ }
+}
+
// Return the corresponded deinterleaved mask, or nullptr if there is no valid
// mask.
static Value *getMask(Value *WideMask, unsigned Factor,
@@ -268,8 +283,12 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
if (isa<ScalableVectorType>(Load->getType()))
return false;
- if (auto *LI = dyn_cast<LoadInst>(Load);
- LI && !LI->isSimple())
+ auto *LI = dyn_cast<LoadInst>(Load);
+ auto *II = dyn_cast<IntrinsicInst>(Load);
+ if (!LI && !II)
+ return false;
+
+ if (LI && !LI->isSimple())
return false;
// Check if all users of this load are shufflevectors. If we encounter any
@@ -322,7 +341,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
// Holds the corresponding index for each DE-interleave shuffle.
SmallVector<unsigned, 4> Indices;
- Type *VecTy = FirstSVI->getType();
+ VectorType *VecTy = cast<VectorType>(FirstSVI->getType());
// Check if other shufflevectors are also DE-interleaved of the same type
// and factor as the first shufflevector.
@@ -360,13 +379,16 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);
Value *Mask = nullptr;
- if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
- Mask = getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy));
+ if (LI) {
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
+ } else {
+ // Check mask operand. Handle both all-true/false and interleaved mask.
+ Mask = getMask(getMaskOperand(II), Factor, VecTy);
if (!Mask)
return false;
- LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n");
- } else {
- LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
+
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: "
+ << *Load << "\n");
}
// Try to create target specific intrinsics to replace the load and
@@ -483,15 +505,16 @@ bool InterleavedAccessImpl::tryReplaceExtracts(
bool InterleavedAccessImpl::lowerInterleavedStore(
Instruction *Store, SmallSetVector<Instruction *, 32> &DeadInsts) {
Value *StoredValue;
- if (auto *SI = dyn_cast<StoreInst>(Store)) {
+ auto *SI = dyn_cast<StoreInst>(Store);
+ auto *II = dyn_cast<IntrinsicInst>(Store);
+ if (SI) {
if (!SI->isSimple())
return false;
StoredValue = SI->getValueOperand();
- } else if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
- assert(VPStore->getIntrinsicID() == Intrinsic::vp_store);
- StoredValue = VPStore->getArgOperand(0);
} else {
- llvm_unreachable("unsupported store operation");
+ assert(II->getIntrinsicID() == Intrinsic::vp_store ||
+ II->getIntrinsicID() == Intrinsic::masked_store);
+ StoredValue = II->getArgOperand(0);
}
auto *SVI = dyn_cast<ShuffleVectorInst>(StoredValue);
@@ -508,18 +531,18 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
"number of stored element should be a multiple of Factor");
Value *Mask = nullptr;
- if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
+ if (SI) {
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
+ } else {
+ // Check mask operand. Handle both all-true/false and interleaved mask.
unsigned LaneMaskLen = NumStoredElements / Factor;
- Mask = getMask(VPStore->getMaskParam(), Factor,
+ Mask = getMask(getMaskOperand(II), Factor,
ElementCount::getFixed(LaneMaskLen));
if (!Mask)
return false;
- LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *Store
- << "\n");
-
- } else {
- LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: "
+ << *Store << "\n");
}
// Try to create target specific intrinsics to replace the store and
@@ -564,6 +587,27 @@ static Value *getMask(Value *WideMask, unsigned Factor,
}
}
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(WideMask)) {
+ // Check that the shuffle mask is: a) an interleave, b) all of the same
+ // set of the elements, and c) contained by the first source. (c) could
+ // be relaxed if desired.
+ unsigned NumSrcElts =
+ cast<FixedVectorType>(SVI->getOperand(1)->getType())->getNumElements();
+ SmallVector<unsigned> StartIndexes;
+ if (ShuffleVectorInst::isInterleaveMask(SVI->getShuffleMask(), Factor,
+ NumSrcElts * 2, StartIndexes) &&
+ llvm::all_of(StartIndexes, [](unsigned Start) { return Start == 0; }) &&
+ llvm::all_of(SVI->getShuffleMask(), [&NumSrcElts](int Idx) {
+ return Idx < (int)NumSrcElts;
+ })) {
+ auto *LeafMaskTy =
+ VectorType::get(Type::getInt1Ty(SVI->getContext()), LeafValueEC);
+ IRBuilder<> Builder(SVI);
+ return Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0),
+ uint64_t(0));
+ }
+ }
+
return nullptr;
}
@@ -590,21 +634,12 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
<< " and factor = " << Factor << "\n");
} else {
assert(II);
-
- // Check mask operand. Handle both all-true/false and interleaved mask.
- Value *WideMask;
- switch (II->getIntrinsicID()) {
- default:
+ if (II->getIntrinsicID() != Intrinsic::masked_load &&
+ II->getIntrinsicID() != Intrinsic::vp_load)
return false;
- case Intrinsic::vp_load:
- WideMask = II->getOperand(1);
- break;
- case Intrinsic::masked_load:
- WideMask = II->getOperand(2);
- break;
- }
- Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI));
+ // Check mask operand. Handle both all-true/false and interleaved mask.
+ Mask = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI));
if (!Mask)
return false;
@@ -641,19 +676,11 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
Value *Mask = nullptr;
if (II) {
- // Check mask operand. Handle both all-true/false and interleaved mask.
- Value *WideMask;
- switch (II->getIntrinsicID()) {
- default:
+ if (II->getIntrinsicID() != Intrinsic::masked_store &&
+ II->getIntrinsicID() != Intrinsic::vp_store)
return false;
- case Intrinsic::vp_store:
- WideMask = II->getOperand(2);
- break;
- case Intrinsic::masked_store:
- WideMask = II->getOperand(3);
- break;
- }
- Mask = getMask(WideMask, Factor,
+ // Check mask operand. Handle both all-true/false and interleaved mask.
+ Mask = getMask(getMaskOperand(II), Factor,
cast<VectorType>(InterleaveValues[0]->getType()));
if (!Mask)
return false;
@@ -687,11 +714,13 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
using namespace PatternMatch;
for (auto &I : instructions(F)) {
if (match(&I, m_CombineOr(m_Load(m_Value()),
- m_Intrinsic<Intrinsic::vp_load>())))
+ m_Intrinsic<Intrinsic::vp_load>())) ||
+ match(&I, m_Intrinsic<Intrinsic::masked_load>()))
Changed |= lowerInterleavedLoad(&I, DeadInsts);
if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()),
- m_Intrinsic<Intrinsic::vp_store>())))
+ m_Intrinsic<Intrinsic::vp_store>())) ||
+ match(&I, m_Intrinsic<Intrinsic::masked_store>()))
Changed |= lowerInterleavedStore(&I, DeadInsts);
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index 7710b50..bc4e299 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -815,6 +815,9 @@ static void printMI(raw_ostream &OS, MFPrintState &State,
if (MI.getFlag(MachineInstr::SameSign))
OS << "samesign ";
+ // NOTE: Please add new MIFlags also to the MI_FLAGS_STR in
+ // llvm/utils/update_mir_test_checks.py.
+
OS << TII->getName(MI.getOpcode());
LS = ListSeparator();
diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp
index 44b648a..4da0184 100644
--- a/llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -83,27 +83,6 @@ llvm::createUnpackMachineBundles(
return new UnpackMachineBundles(std::move(Ftor));
}
-namespace {
- class FinalizeMachineBundles : public MachineFunctionPass {
- public:
- static char ID; // Pass identification
- FinalizeMachineBundles() : MachineFunctionPass(ID) {
- initializeFinalizeMachineBundlesPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
- };
-} // end anonymous namespace
-
-char FinalizeMachineBundles::ID = 0;
-char &llvm::FinalizeMachineBundlesID = FinalizeMachineBundles::ID;
-INITIALIZE_PASS(FinalizeMachineBundles, "finalize-mi-bundles",
- "Finalize machine instruction bundles", false, false)
-
-bool FinalizeMachineBundles::runOnMachineFunction(MachineFunction &MF) {
- return llvm::finalizeBundles(MF);
-}
-
/// Return the first found DebugLoc that has a DILocation, given a range of
/// instructions. The search range is from FirstMI to LastMI (exclusive). If no
/// DILocation is found, then an empty location is returned.
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index e144111..286fbfd 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -49,7 +49,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
#include <cassert>
#include <limits>
#include <vector>
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index e5704c0..583a85a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
@@ -357,6 +358,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::PATCHPOINT:
Res = PromoteIntRes_PATCHPOINT(N);
break;
+ case ISD::READ_REGISTER:
+ Res = PromoteIntRes_READ_REGISTER(N);
+ break;
}
// If the result is null then the sub-method took care of registering it.
@@ -2076,6 +2080,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::PATCHPOINT:
Res = PromoteIntOp_PATCHPOINT(N, OpNo);
break;
+ case ISD::WRITE_REGISTER:
+ Res = PromoteIntOp_WRITE_REGISTER(N, OpNo);
+ break;
case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
Res = PromoteIntOp_VP_STRIDED(N, OpNo);
@@ -2853,6 +2860,15 @@ SDValue DAGTypeLegalizer::PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo) {
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
+SDValue DAGTypeLegalizer::PromoteIntOp_WRITE_REGISTER(SDNode *N,
+ unsigned OpNo) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+ Fn.getContext().diagnose(DiagnosticInfoLegalizationFailure(
+ "cannot use llvm.write_register with illegal type", Fn,
+ N->getDebugLoc()));
+ return N->getOperand(0);
+}
+
SDValue DAGTypeLegalizer::PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo) {
assert((N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD && OpNo == 3) ||
(N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_STORE && OpNo == 4));
@@ -3127,6 +3143,10 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::VSCALE:
ExpandIntRes_VSCALE(N, Lo, Hi);
break;
+
+ case ISD::READ_REGISTER:
+ ExpandIntRes_READ_REGISTER(N, Lo, Hi);
+ break;
}
// If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -5471,6 +5491,18 @@ void DAGTypeLegalizer::ExpandIntRes_VSCALE(SDNode *N, SDValue &Lo,
SplitInteger(Res, Lo, Hi);
}
+void DAGTypeLegalizer::ExpandIntRes_READ_REGISTER(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+ Fn.getContext().diagnose(DiagnosticInfoLegalizationFailure(
+ "cannot use llvm.read_register with illegal type", Fn, N->getDebugLoc()));
+ ReplaceValueWith(SDValue(N, 1), N->getOperand(0));
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ Lo = DAG.getPOISON(LoVT);
+ Hi = DAG.getPOISON(HiVT);
+}
+
//===----------------------------------------------------------------------===//
// Integer Operand Expansion
//===----------------------------------------------------------------------===//
@@ -5537,6 +5569,9 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
Res = ExpandIntOp_VP_STRIDED(N, OpNo);
break;
+ case ISD::WRITE_REGISTER:
+ Res = ExpandIntOp_WRITE_REGISTER(N, OpNo);
+ break;
}
// If the result is null, the sub-method took care of registering results etc.
@@ -5935,6 +5970,15 @@ SDValue DAGTypeLegalizer::ExpandIntOp_VP_STRIDED(SDNode *N, unsigned OpNo) {
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
+SDValue DAGTypeLegalizer::ExpandIntOp_WRITE_REGISTER(SDNode *N, unsigned OpNo) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+ Fn.getContext().diagnose(DiagnosticInfoLegalizationFailure(
+ "cannot use llvm.write_register with illegal type", Fn,
+ N->getDebugLoc()));
+
+ return N->getOperand(0);
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SPLICE(SDNode *N) {
SDLoc dl(N);
@@ -6332,6 +6376,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_PATCHPOINT(SDNode *N) {
return Res.getValue(0);
}
+SDValue DAGTypeLegalizer::PromoteIntRes_READ_REGISTER(SDNode *N) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+ Fn.getContext().diagnose(DiagnosticInfoLegalizationFailure(
+ "cannot use llvm.read_register with illegal type", Fn, N->getDebugLoc()));
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ ReplaceValueWith(SDValue(N, 1), N->getOperand(0));
+ return DAG.getPOISON(NVT);
+}
+
SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) {
SDLoc dl(N);
SDValue V0 = GetPromotedInteger(N->getOperand(0));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 9b53724..2e13b18 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -378,6 +378,7 @@ private:
SDValue PromoteIntRes_VPFunnelShift(SDNode *N);
SDValue PromoteIntRes_IS_FPCLASS(SDNode *N);
SDValue PromoteIntRes_PATCHPOINT(SDNode *N);
+ SDValue PromoteIntRes_READ_REGISTER(SDNode *N);
SDValue PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N);
SDValue PromoteIntRes_GET_ACTIVE_LANE_MASK(SDNode *N);
SDValue PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N);
@@ -428,6 +429,7 @@ private:
SDValue PromoteIntOp_SET_ROUNDING(SDNode *N);
SDValue PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_WRITE_REGISTER(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, unsigned OpNo);
@@ -511,6 +513,7 @@ private:
void ExpandIntRes_FunnelShift (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_VSCALE (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_READ_REGISTER(SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandShiftByConstant(SDNode *N, const APInt &Amt,
SDValue &Lo, SDValue &Hi);
@@ -534,6 +537,7 @@ private:
SDValue ExpandIntOp_STACKMAP(SDNode *N, unsigned OpNo);
SDValue ExpandIntOp_PATCHPOINT(SDNode *N, unsigned OpNo);
SDValue ExpandIntOp_VP_STRIDED(SDNode *N, unsigned OpNo);
+ SDValue ExpandIntOp_WRITE_REGISTER(SDNode *N, unsigned OpNo);
void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
ISD::CondCode &CCCode, const SDLoc &dl);
diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp
index b79911b..2a8234a 100644
--- a/llvm/lib/CodeGen/StackProtector.cpp
+++ b/llvm/lib/CodeGen/StackProtector.cpp
@@ -588,7 +588,14 @@ bool InsertStackProtectors(const TargetMachine *TM, Function *F,
continue;
Instruction *CheckLoc = dyn_cast<ReturnInst>(BB.getTerminator());
if (!CheckLoc && !DisableCheckNoReturn)
- for (auto &Inst : BB)
+ for (auto &Inst : BB) {
+ if (IntrinsicInst *IB = dyn_cast<IntrinsicInst>(&Inst);
+ IB && (IB->getIntrinsicID() == Intrinsic::eh_sjlj_callsite)) {
+ // eh_sjlj_callsite has to be in same BB as the
+ // bb terminator. Don't insert within this range.
+ CheckLoc = IB;
+ break;
+ }
if (auto *CB = dyn_cast<CallBase>(&Inst))
// Do stack check before noreturn calls that aren't nounwind (e.g:
// __cxa_throw).
@@ -596,6 +603,7 @@ bool InsertStackProtectors(const TargetMachine *TM, Function *F,
CheckLoc = CB;
break;
}
+ }
if (!CheckLoc)
continue;
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 7e501a9..408d07b 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -42,7 +42,6 @@
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/PseudoProbe.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmInfoDarwin.h"
@@ -996,7 +995,7 @@ MCSection *TargetLoweringObjectFileELF::getSectionForLSDA(
if (!LSDASection || (!F.hasComdat() && !TM.getFunctionSections()))
return LSDASection;
- const auto *LSDA = cast<MCSectionELF>(LSDASection);
+ const auto *LSDA = static_cast<const MCSectionELF *>(LSDASection);
unsigned Flags = LSDA->getFlags();
const MCSymbolELF *LinkedToSym = nullptr;
StringRef Group;
@@ -2055,14 +2054,14 @@ MCSection *TargetLoweringObjectFileCOFF::getStaticCtorSection(
unsigned Priority, const MCSymbol *KeySym) const {
return getCOFFStaticStructorSection(
getContext(), getContext().getTargetTriple(), true, Priority, KeySym,
- cast<MCSectionCOFF>(StaticCtorSection));
+ static_cast<MCSectionCOFF *>(StaticCtorSection));
}
MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection(
unsigned Priority, const MCSymbol *KeySym) const {
return getCOFFStaticStructorSection(
getContext(), getContext().getTargetTriple(), false, Priority, KeySym,
- cast<MCSectionCOFF>(StaticDtorSection));
+ static_cast<MCSectionCOFF *>(StaticDtorSection));
}
const MCExpr *TargetLoweringObjectFileCOFF::lowerRelativeReference(
@@ -2389,23 +2388,25 @@ TargetLoweringObjectFileXCOFF::getTargetSymbol(const GlobalValue *GV,
// here.
if (const GlobalObject *GO = dyn_cast<GlobalObject>(GV)) {
if (GO->isDeclarationForLinker())
- return cast<MCSectionXCOFF>(getSectionForExternalReference(GO, TM))
+ return static_cast<const MCSectionXCOFF *>(
+ getSectionForExternalReference(GO, TM))
->getQualNameSymbol();
if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
if (GVar->hasAttribute("toc-data"))
- return cast<MCSectionXCOFF>(
+ return static_cast<const MCSectionXCOFF *>(
SectionForGlobal(GVar, SectionKind::getData(), TM))
->getQualNameSymbol();
SectionKind GOKind = getKindForGlobal(GO, TM);
if (GOKind.isText())
- return cast<MCSectionXCOFF>(
+ return static_cast<const MCSectionXCOFF *>(
getSectionForFunctionDescriptor(cast<Function>(GO), TM))
->getQualNameSymbol();
if ((TM.getDataSections() && !GO->hasSection()) || GO->hasCommonLinkage() ||
GOKind.isBSSLocal() || GOKind.isThreadBSSLocal())
- return cast<MCSectionXCOFF>(SectionForGlobal(GO, GOKind, TM))
+ return static_cast<const MCSectionXCOFF *>(
+ SectionForGlobal(GO, GOKind, TM))
->getQualNameSymbol();
}
@@ -2741,7 +2742,7 @@ MCSection *TargetLoweringObjectFileXCOFF::getSectionForTOCEntry(
MCSection *TargetLoweringObjectFileXCOFF::getSectionForLSDA(
const Function &F, const MCSymbol &FnSym, const TargetMachine &TM) const {
- auto *LSDA = cast<MCSectionXCOFF>(LSDASection);
+ auto *LSDA = static_cast<MCSectionXCOFF *>(LSDASection);
if (TM.getFunctionSections()) {
// If option -ffunction-sections is on, append the function name to the
// name of the LSDA csect so that each function has its own LSDA csect.
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp
index 2abab02..4d879b6 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp
@@ -8,12 +8,9 @@
#include "llvm/DebugInfo/DWARF/DWARFCFIPrinter.h"
#include "llvm/DebugInfo/DIContext.h"
-#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
#include "llvm/DebugInfo/DWARF/DWARFExpressionPrinter.h"
#include "llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h"
#include "llvm/Support/Compiler.h"
-#include "llvm/Support/DataExtractor.h"
-#include "llvm/Support/Errc.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp b/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp
index 7072418..9a7f7d1 100644
--- a/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp
+++ b/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp
@@ -7,8 +7,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/LowLevel/DWARFExpression.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/Support/Format.h"
#include <cassert>
#include <cstdint>
#include <vector>
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index b3798f1..a8559e7 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -183,7 +183,7 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
std::lock_guard<sys::Mutex> locked(lock);
// Save information about our target
- Arch = (Triple::ArchType)Obj.getArch();
+ Arch = Obj.getArch();
IsTargetLittleEndian = Obj.isLittleEndian();
setMipsABI(Obj);
@@ -1361,18 +1361,17 @@ std::unique_ptr<RuntimeDyld::LoadedObjectInfo>
RuntimeDyld::loadObject(const ObjectFile &Obj) {
if (!Dyld) {
if (Obj.isELF())
- Dyld =
- createRuntimeDyldELF(static_cast<Triple::ArchType>(Obj.getArch()),
- MemMgr, Resolver, ProcessAllSections,
- std::move(NotifyStubEmitted));
+ Dyld = createRuntimeDyldELF(Obj.getArch(), MemMgr, Resolver,
+ ProcessAllSections,
+ std::move(NotifyStubEmitted));
else if (Obj.isMachO())
- Dyld = createRuntimeDyldMachO(
- static_cast<Triple::ArchType>(Obj.getArch()), MemMgr, Resolver,
- ProcessAllSections, std::move(NotifyStubEmitted));
+ Dyld = createRuntimeDyldMachO(Obj.getArch(), MemMgr, Resolver,
+ ProcessAllSections,
+ std::move(NotifyStubEmitted));
else if (Obj.isCOFF())
- Dyld = createRuntimeDyldCOFF(
- static_cast<Triple::ArchType>(Obj.getArch()), MemMgr, Resolver,
- ProcessAllSections, std::move(NotifyStubEmitted));
+ Dyld = createRuntimeDyldCOFF(Obj.getArch(), MemMgr, Resolver,
+ ProcessAllSections,
+ std::move(NotifyStubEmitted));
else
report_fatal_error("Incompatible object format!");
}
diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp
index b79f6ec..ce35a5b 100644
--- a/llvm/lib/FileCheck/FileCheck.cpp
+++ b/llvm/lib/FileCheck/FileCheck.cpp
@@ -1360,6 +1360,12 @@ void Pattern::printFuzzyMatch(const SourceMgr &SM, StringRef Buffer,
size_t Best = StringRef::npos;
double BestQuality = 0;
+ // Arbitrarily limit quadratic search behavior stemming from long CHECK lines.
+ if (size_t(4096) * size_t(2048) <
+ std::min(size_t(4096), Buffer.size()) *
+ std::max(FixedStr.size(), RegExStr.size()))
+ return;
+
// Use an arbitrary 4k limit on how far we will search.
for (size_t i = 0, e = std::min(size_t(4096), Buffer.size()); i != e; ++i) {
if (Buffer[i] == '\n')
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
index f7669f0..53f5934 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
@@ -12,6 +12,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Frontend/HLSL/RootSignatureMetadata.h"
+#include "llvm/Frontend/HLSL/RootSignatureValidations.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/ScopedPrinter.h"
@@ -20,6 +22,42 @@ namespace llvm {
namespace hlsl {
namespace rootsig {
+static std::optional<uint32_t> extractMdIntValue(MDNode *Node,
+ unsigned int OpId) {
+ if (auto *CI =
+ mdconst::dyn_extract<ConstantInt>(Node->getOperand(OpId).get()))
+ return CI->getZExtValue();
+ return std::nullopt;
+}
+
+static std::optional<float> extractMdFloatValue(MDNode *Node,
+ unsigned int OpId) {
+ if (auto *CI = mdconst::dyn_extract<ConstantFP>(Node->getOperand(OpId).get()))
+ return CI->getValueAPF().convertToFloat();
+ return std::nullopt;
+}
+
+static std::optional<StringRef> extractMdStringValue(MDNode *Node,
+ unsigned int OpId) {
+ MDString *NodeText = dyn_cast<MDString>(Node->getOperand(OpId));
+ if (NodeText == nullptr)
+ return std::nullopt;
+ return NodeText->getString();
+}
+
+static bool reportError(LLVMContext *Ctx, Twine Message,
+ DiagnosticSeverity Severity = DS_Error) {
+ Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity));
+ return true;
+}
+
+static bool reportValueError(LLVMContext *Ctx, Twine ParamName,
+ uint32_t Value) {
+ Ctx->diagnose(DiagnosticInfoGeneric(
+ "Invalid value for " + ParamName + ": " + Twine(Value), DS_Error));
+ return true;
+}
+
static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
{"CBV", dxil::ResourceClass::CBuffer},
{"SRV", dxil::ResourceClass::SRV},
@@ -189,6 +227,442 @@ MDNode *MetadataBuilder::BuildStaticSampler(const StaticSampler &Sampler) {
return MDNode::get(Ctx, Operands);
}
+bool MetadataParser::parseRootFlags(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD,
+ MDNode *RootFlagNode) {
+
+ if (RootFlagNode->getNumOperands() != 2)
+ return reportError(Ctx, "Invalid format for RootFlag Element");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootFlagNode, 1))
+ RSD.Flags = *Val;
+ else
+ return reportError(Ctx, "Invalid value for RootFlag");
+
+ return false;
+}
+
+bool MetadataParser::parseRootConstants(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD,
+ MDNode *RootConstantNode) {
+
+ if (RootConstantNode->getNumOperands() != 5)
+ return reportError(Ctx, "Invalid format for RootConstants Element");
+
+ dxbc::RTS0::v1::RootParameterHeader Header;
+ // The parameter offset doesn't matter here - we recalculate it during
+ // serialization Header.ParameterOffset = 0;
+ Header.ParameterType =
+ llvm::to_underlying(dxbc::RootParameterType::Constants32Bit);
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 1))
+ Header.ShaderVisibility = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ShaderVisibility");
+
+ dxbc::RTS0::v1::RootConstants Constants;
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 2))
+ Constants.ShaderRegister = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ShaderRegister");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 3))
+ Constants.RegisterSpace = *Val;
+ else
+ return reportError(Ctx, "Invalid value for RegisterSpace");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 4))
+ Constants.Num32BitValues = *Val;
+ else
+ return reportError(Ctx, "Invalid value for Num32BitValues");
+
+ RSD.ParametersContainer.addParameter(Header, Constants);
+
+ return false;
+}
+
+bool MetadataParser::parseRootDescriptors(
+ LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
+ MDNode *RootDescriptorNode, RootSignatureElementKind ElementKind) {
+ assert(ElementKind == RootSignatureElementKind::SRV ||
+ ElementKind == RootSignatureElementKind::UAV ||
+ ElementKind == RootSignatureElementKind::CBV &&
+ "parseRootDescriptors should only be called with RootDescriptor "
+ "element kind.");
+ if (RootDescriptorNode->getNumOperands() != 5)
+ return reportError(Ctx, "Invalid format for Root Descriptor Element");
+
+ dxbc::RTS0::v1::RootParameterHeader Header;
+ switch (ElementKind) {
+ case RootSignatureElementKind::SRV:
+ Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::SRV);
+ break;
+ case RootSignatureElementKind::UAV:
+ Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::UAV);
+ break;
+ case RootSignatureElementKind::CBV:
+ Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::CBV);
+ break;
+ default:
+ llvm_unreachable("invalid Root Descriptor kind");
+ break;
+ }
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 1))
+ Header.ShaderVisibility = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ShaderVisibility");
+
+ dxbc::RTS0::v2::RootDescriptor Descriptor;
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 2))
+ Descriptor.ShaderRegister = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ShaderRegister");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 3))
+ Descriptor.RegisterSpace = *Val;
+ else
+ return reportError(Ctx, "Invalid value for RegisterSpace");
+
+ if (RSD.Version == 1) {
+ RSD.ParametersContainer.addParameter(Header, Descriptor);
+ return false;
+ }
+ assert(RSD.Version > 1);
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 4))
+ Descriptor.Flags = *Val;
+ else
+ return reportError(Ctx, "Invalid value for Root Descriptor Flags");
+
+ RSD.ParametersContainer.addParameter(Header, Descriptor);
+ return false;
+}
+
+bool MetadataParser::parseDescriptorRange(LLVMContext *Ctx,
+ mcdxbc::DescriptorTable &Table,
+ MDNode *RangeDescriptorNode) {
+
+ if (RangeDescriptorNode->getNumOperands() != 6)
+ return reportError(Ctx, "Invalid format for Descriptor Range");
+
+ dxbc::RTS0::v2::DescriptorRange Range;
+
+ std::optional<StringRef> ElementText =
+ extractMdStringValue(RangeDescriptorNode, 0);
+
+ if (!ElementText.has_value())
+ return reportError(Ctx, "Descriptor Range, first element is not a string.");
+
+ Range.RangeType =
+ StringSwitch<uint32_t>(*ElementText)
+ .Case("CBV", llvm::to_underlying(dxbc::DescriptorRangeType::CBV))
+ .Case("SRV", llvm::to_underlying(dxbc::DescriptorRangeType::SRV))
+ .Case("UAV", llvm::to_underlying(dxbc::DescriptorRangeType::UAV))
+ .Case("Sampler",
+ llvm::to_underlying(dxbc::DescriptorRangeType::Sampler))
+ .Default(~0U);
+
+ if (Range.RangeType == ~0U)
+ return reportError(Ctx, "Invalid Descriptor Range type: " + *ElementText);
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 1))
+ Range.NumDescriptors = *Val;
+ else
+ return reportError(Ctx, "Invalid value for Number of Descriptor in Range");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 2))
+ Range.BaseShaderRegister = *Val;
+ else
+ return reportError(Ctx, "Invalid value for BaseShaderRegister");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 3))
+ Range.RegisterSpace = *Val;
+ else
+ return reportError(Ctx, "Invalid value for RegisterSpace");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 4))
+ Range.OffsetInDescriptorsFromTableStart = *Val;
+ else
+ return reportError(Ctx,
+ "Invalid value for OffsetInDescriptorsFromTableStart");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 5))
+ Range.Flags = *Val;
+ else
+ return reportError(Ctx, "Invalid value for Descriptor Range Flags");
+
+ Table.Ranges.push_back(Range);
+ return false;
+}
+
+bool MetadataParser::parseDescriptorTable(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD,
+ MDNode *DescriptorTableNode) {
+ const unsigned int NumOperands = DescriptorTableNode->getNumOperands();
+ if (NumOperands < 2)
+ return reportError(Ctx, "Invalid format for Descriptor Table");
+
+ dxbc::RTS0::v1::RootParameterHeader Header;
+ if (std::optional<uint32_t> Val = extractMdIntValue(DescriptorTableNode, 1))
+ Header.ShaderVisibility = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ShaderVisibility");
+
+ mcdxbc::DescriptorTable Table;
+ Header.ParameterType =
+ llvm::to_underlying(dxbc::RootParameterType::DescriptorTable);
+
+ for (unsigned int I = 2; I < NumOperands; I++) {
+ MDNode *Element = dyn_cast<MDNode>(DescriptorTableNode->getOperand(I));
+ if (Element == nullptr)
+ return reportError(Ctx, "Missing Root Element Metadata Node.");
+
+ if (parseDescriptorRange(Ctx, Table, Element))
+ return true;
+ }
+
+ RSD.ParametersContainer.addParameter(Header, Table);
+ return false;
+}
+
+bool MetadataParser::parseStaticSampler(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD,
+ MDNode *StaticSamplerNode) {
+ if (StaticSamplerNode->getNumOperands() != 14)
+ return reportError(Ctx, "Invalid format for Static Sampler");
+
+ dxbc::RTS0::v1::StaticSampler Sampler;
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 1))
+ Sampler.Filter = *Val;
+ else
+ return reportError(Ctx, "Invalid value for Filter");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 2))
+ Sampler.AddressU = *Val;
+ else
+ return reportError(Ctx, "Invalid value for AddressU");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 3))
+ Sampler.AddressV = *Val;
+ else
+ return reportError(Ctx, "Invalid value for AddressV");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 4))
+ Sampler.AddressW = *Val;
+ else
+ return reportError(Ctx, "Invalid value for AddressW");
+
+ if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 5))
+ Sampler.MipLODBias = *Val;
+ else
+ return reportError(Ctx, "Invalid value for MipLODBias");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 6))
+ Sampler.MaxAnisotropy = *Val;
+ else
+ return reportError(Ctx, "Invalid value for MaxAnisotropy");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 7))
+ Sampler.ComparisonFunc = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ComparisonFunc ");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 8))
+ Sampler.BorderColor = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ComparisonFunc ");
+
+ if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 9))
+ Sampler.MinLOD = *Val;
+ else
+ return reportError(Ctx, "Invalid value for MinLOD");
+
+ if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 10))
+ Sampler.MaxLOD = *Val;
+ else
+ return reportError(Ctx, "Invalid value for MaxLOD");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 11))
+ Sampler.ShaderRegister = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ShaderRegister");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 12))
+ Sampler.RegisterSpace = *Val;
+ else
+ return reportError(Ctx, "Invalid value for RegisterSpace");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 13))
+ Sampler.ShaderVisibility = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ShaderVisibility");
+
+ RSD.StaticSamplers.push_back(Sampler);
+ return false;
+}
+
+bool MetadataParser::parseRootSignatureElement(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD,
+ MDNode *Element) {
+ std::optional<StringRef> ElementText = extractMdStringValue(Element, 0);
+ if (!ElementText.has_value())
+ return reportError(Ctx, "Invalid format for Root Element");
+
+ RootSignatureElementKind ElementKind =
+ StringSwitch<RootSignatureElementKind>(*ElementText)
+ .Case("RootFlags", RootSignatureElementKind::RootFlags)
+ .Case("RootConstants", RootSignatureElementKind::RootConstants)
+ .Case("RootCBV", RootSignatureElementKind::CBV)
+ .Case("RootSRV", RootSignatureElementKind::SRV)
+ .Case("RootUAV", RootSignatureElementKind::UAV)
+ .Case("DescriptorTable", RootSignatureElementKind::DescriptorTable)
+ .Case("StaticSampler", RootSignatureElementKind::StaticSamplers)
+ .Default(RootSignatureElementKind::Error);
+
+ switch (ElementKind) {
+
+ case RootSignatureElementKind::RootFlags:
+ return parseRootFlags(Ctx, RSD, Element);
+ case RootSignatureElementKind::RootConstants:
+ return parseRootConstants(Ctx, RSD, Element);
+ case RootSignatureElementKind::CBV:
+ case RootSignatureElementKind::SRV:
+ case RootSignatureElementKind::UAV:
+ return parseRootDescriptors(Ctx, RSD, Element, ElementKind);
+ case RootSignatureElementKind::DescriptorTable:
+ return parseDescriptorTable(Ctx, RSD, Element);
+ case RootSignatureElementKind::StaticSamplers:
+ return parseStaticSampler(Ctx, RSD, Element);
+ case RootSignatureElementKind::Error:
+ return reportError(Ctx, "Invalid Root Signature Element: " + *ElementText);
+ }
+
+ llvm_unreachable("Unhandled RootSignatureElementKind enum.");
+}
+
+bool MetadataParser::validateRootSignature(
+ LLVMContext *Ctx, const llvm::mcdxbc::RootSignatureDesc &RSD) {
+ if (!llvm::hlsl::rootsig::verifyVersion(RSD.Version)) {
+ return reportValueError(Ctx, "Version", RSD.Version);
+ }
+
+ if (!llvm::hlsl::rootsig::verifyRootFlag(RSD.Flags)) {
+ return reportValueError(Ctx, "RootFlags", RSD.Flags);
+ }
+
+ for (const mcdxbc::RootParameterInfo &Info : RSD.ParametersContainer) {
+ if (!dxbc::isValidShaderVisibility(Info.Header.ShaderVisibility))
+ return reportValueError(Ctx, "ShaderVisibility",
+ Info.Header.ShaderVisibility);
+
+ assert(dxbc::isValidParameterType(Info.Header.ParameterType) &&
+ "Invalid value for ParameterType");
+
+ switch (Info.Header.ParameterType) {
+
+ case llvm::to_underlying(dxbc::RootParameterType::CBV):
+ case llvm::to_underlying(dxbc::RootParameterType::UAV):
+ case llvm::to_underlying(dxbc::RootParameterType::SRV): {
+ const dxbc::RTS0::v2::RootDescriptor &Descriptor =
+ RSD.ParametersContainer.getRootDescriptor(Info.Location);
+ if (!llvm::hlsl::rootsig::verifyRegisterValue(Descriptor.ShaderRegister))
+ return reportValueError(Ctx, "ShaderRegister",
+ Descriptor.ShaderRegister);
+
+ if (!llvm::hlsl::rootsig::verifyRegisterSpace(Descriptor.RegisterSpace))
+ return reportValueError(Ctx, "RegisterSpace", Descriptor.RegisterSpace);
+
+ if (RSD.Version > 1) {
+ if (!llvm::hlsl::rootsig::verifyRootDescriptorFlag(RSD.Version,
+ Descriptor.Flags))
+ return reportValueError(Ctx, "RootDescriptorFlag", Descriptor.Flags);
+ }
+ break;
+ }
+ case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): {
+ const mcdxbc::DescriptorTable &Table =
+ RSD.ParametersContainer.getDescriptorTable(Info.Location);
+ for (const dxbc::RTS0::v2::DescriptorRange &Range : Table) {
+ if (!llvm::hlsl::rootsig::verifyRangeType(Range.RangeType))
+ return reportValueError(Ctx, "RangeType", Range.RangeType);
+
+ if (!llvm::hlsl::rootsig::verifyRegisterSpace(Range.RegisterSpace))
+ return reportValueError(Ctx, "RegisterSpace", Range.RegisterSpace);
+
+ if (!llvm::hlsl::rootsig::verifyNumDescriptors(Range.NumDescriptors))
+ return reportValueError(Ctx, "NumDescriptors", Range.NumDescriptors);
+
+ if (!llvm::hlsl::rootsig::verifyDescriptorRangeFlag(
+ RSD.Version, Range.RangeType, Range.Flags))
+ return reportValueError(Ctx, "DescriptorFlag", Range.Flags);
+ }
+ break;
+ }
+ }
+ }
+
+ for (const dxbc::RTS0::v1::StaticSampler &Sampler : RSD.StaticSamplers) {
+ if (!llvm::hlsl::rootsig::verifySamplerFilter(Sampler.Filter))
+ return reportValueError(Ctx, "Filter", Sampler.Filter);
+
+ if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressU))
+ return reportValueError(Ctx, "AddressU", Sampler.AddressU);
+
+ if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressV))
+ return reportValueError(Ctx, "AddressV", Sampler.AddressV);
+
+ if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressW))
+ return reportValueError(Ctx, "AddressW", Sampler.AddressW);
+
+ if (!llvm::hlsl::rootsig::verifyMipLODBias(Sampler.MipLODBias))
+ return reportValueError(Ctx, "MipLODBias", Sampler.MipLODBias);
+
+ if (!llvm::hlsl::rootsig::verifyMaxAnisotropy(Sampler.MaxAnisotropy))
+ return reportValueError(Ctx, "MaxAnisotropy", Sampler.MaxAnisotropy);
+
+ if (!llvm::hlsl::rootsig::verifyComparisonFunc(Sampler.ComparisonFunc))
+ return reportValueError(Ctx, "ComparisonFunc", Sampler.ComparisonFunc);
+
+ if (!llvm::hlsl::rootsig::verifyBorderColor(Sampler.BorderColor))
+ return reportValueError(Ctx, "BorderColor", Sampler.BorderColor);
+
+ if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MinLOD))
+ return reportValueError(Ctx, "MinLOD", Sampler.MinLOD);
+
+ if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MaxLOD))
+ return reportValueError(Ctx, "MaxLOD", Sampler.MaxLOD);
+
+ if (!llvm::hlsl::rootsig::verifyRegisterValue(Sampler.ShaderRegister))
+ return reportValueError(Ctx, "ShaderRegister", Sampler.ShaderRegister);
+
+ if (!llvm::hlsl::rootsig::verifyRegisterSpace(Sampler.RegisterSpace))
+ return reportValueError(Ctx, "RegisterSpace", Sampler.RegisterSpace);
+
+ if (!dxbc::isValidShaderVisibility(Sampler.ShaderVisibility))
+ return reportValueError(Ctx, "ShaderVisibility",
+ Sampler.ShaderVisibility);
+ }
+
+ return false;
+}
+
+bool MetadataParser::ParseRootSignature(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD) {
+ bool HasError = false;
+
+ // Loop through the Root Elements of the root signature.
+ for (const auto &Operand : Root->operands()) {
+ MDNode *Element = dyn_cast<MDNode>(Operand);
+ if (Element == nullptr)
+ return reportError(Ctx, "Missing Root Element Metadata Node.");
+
+ HasError = HasError || parseRootSignatureElement(Ctx, RSD, Element) ||
+ validateRootSignature(Ctx, RSD);
+ }
+
+ return HasError;
+}
} // namespace rootsig
} // namespace hlsl
} // namespace llvm
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 28ed1e5..7159107 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1450,6 +1450,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
.Case("popc.ll", true)
.Case("h2f", true)
.Case("swap.lo.hi.b64", true)
+ .Case("tanh.approx.f32", true)
.Default(false);
if (Expand) {
@@ -2543,6 +2544,12 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
MDNode *MD = MDNode::get(Builder.getContext(), {});
LD->setMetadata(LLVMContext::MD_invariant_load, MD);
return LD;
+ } else if (Name == "tanh.approx.f32") {
+ // nvvm.tanh.approx.f32 -> afn llvm.tanh.f32
+ FastMathFlags FMF;
+ FMF.setApproxFunc();
+ Rep = Builder.CreateUnaryIntrinsic(Intrinsic::tanh, CI->getArgOperand(0),
+ FMF);
} else if (Name == "barrier0" || Name == "barrier.n" || Name == "bar.sync") {
Value *Arg =
Name.ends_with('0') ? Builder.getInt32(0) : CI->getArgOperand(0);
diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp
index b94dcac..4f37624 100644
--- a/llvm/lib/IR/DiagnosticInfo.cpp
+++ b/llvm/lib/IR/DiagnosticInfo.cpp
@@ -81,6 +81,10 @@ void DiagnosticInfoInlineAsm::print(DiagnosticPrinter &DP) const {
DP << " at line " << getLocCookie();
}
+void DiagnosticInfoLegalizationFailure::print(DiagnosticPrinter &DP) const {
+ DP << getLocationStr() << ": " << getMsgStr();
+}
+
DiagnosticInfoRegAllocFailure::DiagnosticInfoRegAllocFailure(
const Twine &MsgStr, const Function &Fn, const DiagnosticLocation &DL,
DiagnosticSeverity Severity)
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index 5e1bf28..9c34662 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -304,14 +304,12 @@ IntegerType *Type::getIntNTy(LLVMContext &C, unsigned N) {
Type *Type::getWasm_ExternrefTy(LLVMContext &C) {
// opaque pointer in addrspace(10)
- static PointerType *Ty = PointerType::get(C, 10);
- return Ty;
+ return PointerType::get(C, 10);
}
Type *Type::getWasm_FuncrefTy(LLVMContext &C) {
// opaque pointer in addrspace(20)
- static PointerType *Ty = PointerType::get(C, 20);
- return Ty;
+ return PointerType::get(C, 20);
}
//===----------------------------------------------------------------------===//
@@ -324,12 +322,12 @@ IntegerType *IntegerType::get(LLVMContext &C, unsigned NumBits) {
// Check for the built-in integer types
switch (NumBits) {
- case 1: return cast<IntegerType>(Type::getInt1Ty(C));
- case 8: return cast<IntegerType>(Type::getInt8Ty(C));
- case 16: return cast<IntegerType>(Type::getInt16Ty(C));
- case 32: return cast<IntegerType>(Type::getInt32Ty(C));
- case 64: return cast<IntegerType>(Type::getInt64Ty(C));
- case 128: return cast<IntegerType>(Type::getInt128Ty(C));
+ case 1: return Type::getInt1Ty(C);
+ case 8: return Type::getInt8Ty(C);
+ case 16: return Type::getInt16Ty(C);
+ case 32: return Type::getInt32Ty(C);
+ case 64: return Type::getInt64Ty(C);
+ case 128: return Type::getInt128Ty(C);
default:
break;
}
diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt
index d662c42..18a85b3 100644
--- a/llvm/lib/MC/CMakeLists.txt
+++ b/llvm/lib/MC/CMakeLists.txt
@@ -43,13 +43,7 @@ add_llvm_component_library(LLVMMC
MCRegisterInfo.cpp
MCSchedule.cpp
MCSection.cpp
- MCSectionCOFF.cpp
- MCSectionDXContainer.cpp
- MCSectionELF.cpp
- MCSectionGOFF.cpp
MCSectionMachO.cpp
- MCSectionWasm.cpp
- MCSectionXCOFF.cpp
MCStreamer.cpp
MCSPIRVStreamer.cpp
MCSubtargetInfo.cpp
diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp
index 9f52b3e..ae8dffc 100644
--- a/llvm/lib/MC/ELFObjectWriter.cpp
+++ b/llvm/lib/MC/ELFObjectWriter.cpp
@@ -559,20 +559,7 @@ void ELFWriter::computeSymbolTable(const RevGroupMapTy &RevGroupMap) {
} else {
const MCSectionELF &Section =
static_cast<const MCSectionELF &>(Symbol.getSection());
-
- // We may end up with a situation when section symbol is technically
- // defined, but should not be. That happens because we explicitly
- // pre-create few .debug_* sections to have accessors.
- // And if these sections were not really defined in the code, but were
- // referenced, we simply error out.
- if (!Section.isRegistered()) {
- assert(static_cast<const MCSymbolELF &>(Symbol).getType() ==
- ELF::STT_SECTION);
- Ctx.reportError(SMLoc(),
- "Undefined section reference: " + Symbol.getName());
- continue;
- }
-
+ assert(Section.isRegistered());
if (Mode == NonDwoOnly && isDwoSection(Section))
continue;
MSD.SectionIndex = Section.getOrdinal();
@@ -1100,7 +1087,8 @@ uint64_t ELFWriter::writeObject() {
// Remember the offset into the file for this section.
const uint64_t SecStart = align(RelSection->getAlign());
- writeRelocations(cast<MCSectionELF>(*RelSection->getLinkedToSection()));
+ writeRelocations(
+ static_cast<const MCSectionELF &>(*RelSection->getLinkedToSection()));
uint64_t SecEnd = W.OS.tell();
RelSection->setOffsets(SecStart, SecEnd);
@@ -1273,7 +1261,7 @@ bool ELFObjectWriter::useSectionSymbol(const MCValue &Val,
// that it pointed to another string and subtracting 42 at runtime will
// produce the wrong value.
if (Sym->isInSection()) {
- auto &Sec = cast<MCSectionELF>(Sym->getSection());
+ auto &Sec = static_cast<const MCSectionELF &>(Sym->getSection());
unsigned Flags = Sec.getFlags();
if (Flags & ELF::SHF_MERGE) {
if (C != 0)
@@ -1325,13 +1313,14 @@ bool ELFObjectWriter::checkRelocation(SMLoc Loc, const MCSectionELF *From,
void ELFObjectWriter::recordRelocation(const MCFragment &F,
const MCFixup &Fixup, MCValue Target,
uint64_t &FixedValue) {
- const MCSectionELF &Section = cast<MCSectionELF>(*F.getParent());
+ auto &Section = static_cast<const MCSectionELF &>(*F.getParent());
MCContext &Ctx = getContext();
const auto *SymA = cast_or_null<MCSymbolELF>(Target.getAddSym());
- const MCSectionELF *SecA = (SymA && SymA->isInSection())
- ? cast<MCSectionELF>(&SymA->getSection())
- : nullptr;
+ const MCSectionELF *SecA =
+ (SymA && SymA->isInSection())
+ ? static_cast<const MCSectionELF *>(&SymA->getSection())
+ : nullptr;
if (DwoOS && !checkRelocation(Fixup.getLoc(), &Section, SecA))
return;
diff --git a/llvm/lib/MC/GOFFObjectWriter.cpp b/llvm/lib/MC/GOFFObjectWriter.cpp
index 1871f5f..88188f3 100644
--- a/llvm/lib/MC/GOFFObjectWriter.cpp
+++ b/llvm/lib/MC/GOFFObjectWriter.cpp
@@ -336,7 +336,7 @@ void GOFFWriter::defineSymbols() {
unsigned Ordinal = 0;
// Process all sections.
for (MCSection &S : Asm) {
- auto &Section = cast<MCSectionGOFF>(S);
+ auto &Section = static_cast<MCSectionGOFF &>(S);
Section.setOrdinal(++Ordinal);
defineSectionSymbols(Section);
}
diff --git a/llvm/lib/MC/MCAsmInfoCOFF.cpp b/llvm/lib/MC/MCAsmInfoCOFF.cpp
index 0b8781c..54717df 100644
--- a/llvm/lib/MC/MCAsmInfoCOFF.cpp
+++ b/llvm/lib/MC/MCAsmInfoCOFF.cpp
@@ -12,7 +12,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/BinaryFormat/COFF.h"
#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
using namespace llvm;
@@ -49,6 +55,10 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() {
HasCOFFComdatConstants = true;
}
+bool MCAsmInfoCOFF::useCodeAlign(const MCSection &Sec) const {
+ return Sec.isText();
+}
+
void MCAsmInfoMicrosoft::anchor() {}
MCAsmInfoMicrosoft::MCAsmInfoMicrosoft() = default;
@@ -64,3 +74,101 @@ MCAsmInfoGNUCOFF::MCAsmInfoGNUCOFF() {
// We don't create constants in comdat sections for MinGW.
HasCOFFComdatConstants = false;
}
+
+bool MCSectionCOFF::shouldOmitSectionDirective(StringRef Name) const {
+ if (COMDATSymbol || isUnique())
+ return false;
+
+ // FIXME: Does .section .bss/.data/.text work everywhere??
+ if (Name == ".text" || Name == ".data" || Name == ".bss")
+ return true;
+
+ return false;
+}
+
+void MCSectionCOFF::setSelection(int Selection) const {
+ assert(Selection != 0 && "invalid COMDAT selection type");
+ this->Selection = Selection;
+ Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
+}
+
+void MCAsmInfoCOFF::printSwitchToSection(const MCSection &Section, uint32_t,
+ const Triple &T,
+ raw_ostream &OS) const {
+ auto &Sec = static_cast<const MCSectionCOFF &>(Section);
+ // standard sections don't require the '.section'
+ if (Sec.shouldOmitSectionDirective(Sec.getName())) {
+ OS << '\t' << Sec.getName() << '\n';
+ return;
+ }
+
+ OS << "\t.section\t" << Sec.getName() << ",\"";
+ if (Sec.getCharacteristics() & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
+ OS << 'd';
+ if (Sec.getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
+ OS << 'b';
+ if (Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_EXECUTE)
+ OS << 'x';
+ if (Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_WRITE)
+ OS << 'w';
+ else if (Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_READ)
+ OS << 'r';
+ else
+ OS << 'y';
+ if (Sec.getCharacteristics() & COFF::IMAGE_SCN_LNK_REMOVE)
+ OS << 'n';
+ if (Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_SHARED)
+ OS << 's';
+ if ((Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_DISCARDABLE) &&
+ !Sec.isImplicitlyDiscardable(Sec.getName()))
+ OS << 'D';
+ if (Sec.getCharacteristics() & COFF::IMAGE_SCN_LNK_INFO)
+ OS << 'i';
+ OS << '"';
+
+ // unique should be tail of .section directive.
+ if (Sec.isUnique() && !Sec.COMDATSymbol)
+ OS << ",unique," << Sec.UniqueID;
+
+ if (Sec.getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) {
+ if (Sec.COMDATSymbol)
+ OS << ",";
+ else
+ OS << "\n\t.linkonce\t";
+ switch (Sec.Selection) {
+ case COFF::IMAGE_COMDAT_SELECT_NODUPLICATES:
+ OS << "one_only";
+ break;
+ case COFF::IMAGE_COMDAT_SELECT_ANY:
+ OS << "discard";
+ break;
+ case COFF::IMAGE_COMDAT_SELECT_SAME_SIZE:
+ OS << "same_size";
+ break;
+ case COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH:
+ OS << "same_contents";
+ break;
+ case COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE:
+ OS << "associative";
+ break;
+ case COFF::IMAGE_COMDAT_SELECT_LARGEST:
+ OS << "largest";
+ break;
+ case COFF::IMAGE_COMDAT_SELECT_NEWEST:
+ OS << "newest";
+ break;
+ default:
+ assert(false && "unsupported COFF selection type");
+ break;
+ }
+ if (Sec.COMDATSymbol) {
+ OS << ",";
+ Sec.COMDATSymbol->print(OS, this);
+ }
+ }
+
+ if (Sec.isUnique() && Sec.COMDATSymbol)
+ OS << ",unique," << Sec.UniqueID;
+
+ OS << '\n';
+}
diff --git a/llvm/lib/MC/MCAsmInfoDarwin.cpp b/llvm/lib/MC/MCAsmInfoDarwin.cpp
index 9cba775..e156fa0 100644
--- a/llvm/lib/MC/MCAsmInfoDarwin.cpp
+++ b/llvm/lib/MC/MCAsmInfoDarwin.cpp
@@ -85,3 +85,8 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() {
DwarfUsesRelocationsAcrossSections = false;
SetDirectiveSuppressesReloc = true;
}
+
+bool MCAsmInfoDarwin::useCodeAlign(const MCSection &Sec) const {
+ return static_cast<const MCSectionMachO &>(Sec).hasAttribute(
+ MachO::S_ATTR_PURE_INSTRUCTIONS);
+}
diff --git a/llvm/lib/MC/MCAsmInfoELF.cpp b/llvm/lib/MC/MCAsmInfoELF.cpp
index 7eb89ef..cdae9d7 100644
--- a/llvm/lib/MC/MCAsmInfoELF.cpp
+++ b/llvm/lib/MC/MCAsmInfoELF.cpp
@@ -12,9 +12,16 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TargetParser/Triple.h"
+#include <cassert>
using namespace llvm;
@@ -28,9 +35,198 @@ MCSection *MCAsmInfoELF::getNonexecutableStackSection(MCContext &Ctx) const {
return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, 0);
}
+bool MCAsmInfoELF::useCodeAlign(const MCSection &Sec) const {
+ return static_cast<const MCSectionELF &>(Sec).getFlags() & ELF::SHF_EXECINSTR;
+}
+
MCAsmInfoELF::MCAsmInfoELF() {
HasIdentDirective = true;
WeakRefDirective = "\t.weak\t";
PrivateGlobalPrefix = ".L";
PrivateLabelPrefix = ".L";
}
+
+static void printName(raw_ostream &OS, StringRef Name) {
+ if (Name.find_first_not_of("0123456789_."
+ "abcdefghijklmnopqrstuvwxyz"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) {
+ OS << Name;
+ return;
+ }
+ OS << '"';
+ for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) {
+ if (*B == '"') // Unquoted "
+ OS << "\\\"";
+ else if (*B != '\\') // Neither " or backslash
+ OS << *B;
+ else if (B + 1 == E) // Trailing backslash
+ OS << "\\\\";
+ else {
+ OS << B[0] << B[1]; // Quoted character
+ ++B;
+ }
+ }
+ OS << '"';
+}
+
+void MCAsmInfoELF::printSwitchToSection(const MCSection &Section,
+ uint32_t Subsection, const Triple &T,
+ raw_ostream &OS) const {
+ auto &Sec = static_cast<const MCSectionELF &>(Section);
+ if (!Sec.isUnique() && shouldOmitSectionDirective(Sec.getName())) {
+ OS << '\t' << Sec.getName();
+ if (Subsection)
+ OS << '\t' << Subsection;
+ OS << '\n';
+ return;
+ }
+
+ OS << "\t.section\t";
+ printName(OS, Sec.getName());
+
+ // Handle the weird solaris syntax if desired.
+ if (usesSunStyleELFSectionSwitchSyntax() && !(Sec.Flags & ELF::SHF_MERGE)) {
+ if (Sec.Flags & ELF::SHF_ALLOC)
+ OS << ",#alloc";
+ if (Sec.Flags & ELF::SHF_EXECINSTR)
+ OS << ",#execinstr";
+ if (Sec.Flags & ELF::SHF_WRITE)
+ OS << ",#write";
+ if (Sec.Flags & ELF::SHF_EXCLUDE)
+ OS << ",#exclude";
+ if (Sec.Flags & ELF::SHF_TLS)
+ OS << ",#tls";
+ OS << '\n';
+ return;
+ }
+
+ OS << ",\"";
+ if (Sec.Flags & ELF::SHF_ALLOC)
+ OS << 'a';
+ if (Sec.Flags & ELF::SHF_EXCLUDE)
+ OS << 'e';
+ if (Sec.Flags & ELF::SHF_EXECINSTR)
+ OS << 'x';
+ if (Sec.Flags & ELF::SHF_WRITE)
+ OS << 'w';
+ if (Sec.Flags & ELF::SHF_MERGE)
+ OS << 'M';
+ if (Sec.Flags & ELF::SHF_STRINGS)
+ OS << 'S';
+ if (Sec.Flags & ELF::SHF_TLS)
+ OS << 'T';
+ if (Sec.Flags & ELF::SHF_LINK_ORDER)
+ OS << 'o';
+ if (Sec.Flags & ELF::SHF_GROUP)
+ OS << 'G';
+ if (Sec.Flags & ELF::SHF_GNU_RETAIN)
+ OS << 'R';
+
+ // If there are os-specific flags, print them.
+ if (T.isOSSolaris())
+ if (Sec.Flags & ELF::SHF_SUNW_NODISCARD)
+ OS << 'R';
+
+ // If there are tarSec.get-specific flags, print them.
+ Triple::ArchType Arch = T.getArch();
+ if (Arch == Triple::xcore) {
+ if (Sec.Flags & ELF::XCORE_SHF_CP_SECTION)
+ OS << 'c';
+ if (Sec.Flags & ELF::XCORE_SHF_DP_SECTION)
+ OS << 'd';
+ } else if (T.isARM() || T.isThumb()) {
+ if (Sec.Flags & ELF::SHF_ARM_PURECODE)
+ OS << 'y';
+ } else if (T.isAArch64()) {
+ if (Sec.Flags & ELF::SHF_AARCH64_PURECODE)
+ OS << 'y';
+ } else if (Arch == Triple::hexagon) {
+ if (Sec.Flags & ELF::SHF_HEX_GPREL)
+ OS << 's';
+ } else if (Arch == Triple::x86_64) {
+ if (Sec.Flags & ELF::SHF_X86_64_LARGE)
+ OS << 'l';
+ }
+
+ OS << '"';
+
+ OS << ',';
+
+ // If comment string is '@', e.g. as on ARM - use '%' instead
+ if (getCommentString()[0] == '@')
+ OS << '%';
+ else
+ OS << '@';
+
+ if (Sec.Type == ELF::SHT_INIT_ARRAY)
+ OS << "init_array";
+ else if (Sec.Type == ELF::SHT_FINI_ARRAY)
+ OS << "fini_array";
+ else if (Sec.Type == ELF::SHT_PREINIT_ARRAY)
+ OS << "preinit_array";
+ else if (Sec.Type == ELF::SHT_NOBITS)
+ OS << "nobits";
+ else if (Sec.Type == ELF::SHT_NOTE)
+ OS << "note";
+ else if (Sec.Type == ELF::SHT_PROGBITS)
+ OS << "progbits";
+ else if (Sec.Type == ELF::SHT_X86_64_UNWIND)
+ OS << "unwind";
+ else if (Sec.Type == ELF::SHT_MIPS_DWARF)
+ // Print hex value of the flag while we do not have
+ // any standard symbolic representation of the flag.
+ OS << "0x7000001e";
+ else if (Sec.Type == ELF::SHT_LLVM_ODRTAB)
+ OS << "llvm_odrtab";
+ else if (Sec.Type == ELF::SHT_LLVM_LINKER_OPTIONS)
+ OS << "llvm_linker_options";
+ else if (Sec.Type == ELF::SHT_LLVM_CALL_GRAPH_PROFILE)
+ OS << "llvm_call_graph_profile";
+ else if (Sec.Type == ELF::SHT_LLVM_DEPENDENT_LIBRARIES)
+ OS << "llvm_dependent_libraries";
+ else if (Sec.Type == ELF::SHT_LLVM_SYMPART)
+ OS << "llvm_sympart";
+ else if (Sec.Type == ELF::SHT_LLVM_BB_ADDR_MAP)
+ OS << "llvm_bb_addr_map";
+ else if (Sec.Type == ELF::SHT_LLVM_OFFLOADING)
+ OS << "llvm_offloading";
+ else if (Sec.Type == ELF::SHT_LLVM_LTO)
+ OS << "llvm_lto";
+ else if (Sec.Type == ELF::SHT_LLVM_JT_SIZES)
+ OS << "llvm_jt_sizes";
+ else if (Sec.Type == ELF::SHT_LLVM_CFI_JUMP_TABLE)
+ OS << "llvm_cfi_jump_table";
+ else
+ OS << "0x" << Twine::utohexstr(Sec.Type);
+
+ if (Sec.EntrySize) {
+ assert((Sec.Flags & ELF::SHF_MERGE) ||
+ Sec.Type == ELF::SHT_LLVM_CFI_JUMP_TABLE);
+ OS << "," << Sec.EntrySize;
+ }
+
+ if (Sec.Flags & ELF::SHF_LINK_ORDER) {
+ OS << ",";
+ if (Sec.LinkedToSym)
+ printName(OS, Sec.LinkedToSym->getName());
+ else
+ OS << '0';
+ }
+
+ if (Sec.Flags & ELF::SHF_GROUP) {
+ OS << ",";
+ printName(OS, Sec.Group.getPointer()->getName());
+ if (Sec.isComdat())
+ OS << ",comdat";
+ }
+
+ if (Sec.isUnique())
+ OS << ",unique," << Sec.UniqueID;
+
+ OS << '\n';
+
+ if (Subsection) {
+ OS << "\t.subsection\t" << Subsection;
+ OS << '\n';
+ }
+}
diff --git a/llvm/lib/MC/MCAsmInfoGOFF.cpp b/llvm/lib/MC/MCAsmInfoGOFF.cpp
index 3c81a46..0a5d1927 100644
--- a/llvm/lib/MC/MCAsmInfoGOFF.cpp
+++ b/llvm/lib/MC/MCAsmInfoGOFF.cpp
@@ -13,11 +13,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAsmInfoGOFF.h"
+#include "llvm/BinaryFormat/GOFF.h"
+#include "llvm/MC/MCSectionGOFF.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-void MCAsmInfoGOFF::anchor() {}
-
MCAsmInfoGOFF::MCAsmInfoGOFF() {
Data64bitsDirective = "\t.quad\t";
HasDotTypeDotSizeDirective = false;
@@ -25,3 +26,136 @@ MCAsmInfoGOFF::MCAsmInfoGOFF() {
PrivateLabelPrefix = "L#";
ZeroDirective = "\t.space\t";
}
+
+static void emitCATTR(raw_ostream &OS, StringRef Name, GOFF::ESDRmode Rmode,
+ GOFF::ESDAlignment Alignment,
+ GOFF::ESDLoadingBehavior LoadBehavior,
+ GOFF::ESDExecutable Executable, bool IsReadOnly,
+ uint32_t SortKey, uint8_t FillByteValue,
+ StringRef PartName) {
+ OS << Name << " CATTR ";
+ OS << "ALIGN(" << static_cast<unsigned>(Alignment) << "),"
+ << "FILL(" << static_cast<unsigned>(FillByteValue) << ")";
+ switch (LoadBehavior) {
+ case GOFF::ESD_LB_Deferred:
+ OS << ",DEFLOAD";
+ break;
+ case GOFF::ESD_LB_NoLoad:
+ OS << ",NOLOAD";
+ break;
+ default:
+ break;
+ }
+ switch (Executable) {
+ case GOFF::ESD_EXE_CODE:
+ OS << ",EXECUTABLE";
+ break;
+ case GOFF::ESD_EXE_DATA:
+ OS << ",NOTEXECUTABLE";
+ break;
+ default:
+ break;
+ }
+ if (IsReadOnly)
+ OS << ",READONLY";
+ if (Rmode != GOFF::ESD_RMODE_None) {
+ OS << ',';
+ OS << "RMODE(";
+ switch (Rmode) {
+ case GOFF::ESD_RMODE_24:
+ OS << "24";
+ break;
+ case GOFF::ESD_RMODE_31:
+ OS << "31";
+ break;
+ case GOFF::ESD_RMODE_64:
+ OS << "64";
+ break;
+ case GOFF::ESD_RMODE_None:
+ break;
+ }
+ OS << ')';
+ }
+ if (SortKey)
+ OS << ",PRIORITY(" << SortKey << ")";
+ if (!PartName.empty())
+ OS << ",PART(" << PartName << ")";
+ OS << '\n';
+}
+
+static void emitXATTR(raw_ostream &OS, StringRef Name,
+ GOFF::ESDLinkageType Linkage,
+ GOFF::ESDExecutable Executable,
+ GOFF::ESDBindingScope BindingScope) {
+ OS << Name << " XATTR ";
+ OS << "LINKAGE(" << (Linkage == GOFF::ESD_LT_OS ? "OS" : "XPLINK") << "),";
+ if (Executable != GOFF::ESD_EXE_Unspecified)
+ OS << "REFERENCE(" << (Executable == GOFF::ESD_EXE_CODE ? "CODE" : "DATA")
+ << "),";
+ if (BindingScope != GOFF::ESD_BSC_Unspecified) {
+ OS << "SCOPE(";
+ switch (BindingScope) {
+ case GOFF::ESD_BSC_Section:
+ OS << "SECTION";
+ break;
+ case GOFF::ESD_BSC_Module:
+ OS << "MODULE";
+ break;
+ case GOFF::ESD_BSC_Library:
+ OS << "LIBRARY";
+ break;
+ case GOFF::ESD_BSC_ImportExport:
+ OS << "EXPORT";
+ break;
+ default:
+ break;
+ }
+ OS << ')';
+ }
+ OS << '\n';
+}
+
+void MCAsmInfoGOFF::printSwitchToSection(const MCSection &Section,
+ uint32_t Subsection, const Triple &T,
+ raw_ostream &OS) const {
+ auto &Sec =
+ const_cast<MCSectionGOFF &>(static_cast<const MCSectionGOFF &>(Section));
+ switch (Sec.SymbolType) {
+ case GOFF::ESD_ST_SectionDefinition: {
+ OS << Sec.getName() << " CSECT\n";
+ Sec.Emitted = true;
+ break;
+ }
+ case GOFF::ESD_ST_ElementDefinition: {
+ printSwitchToSection(*Sec.getParent(), Subsection, T, OS);
+ if (!Sec.Emitted) {
+ emitCATTR(OS, Sec.getName(), Sec.EDAttributes.Rmode,
+ Sec.EDAttributes.Alignment, Sec.EDAttributes.LoadBehavior,
+ GOFF::ESD_EXE_Unspecified, Sec.EDAttributes.IsReadOnly, 0,
+ Sec.EDAttributes.FillByteValue, StringRef());
+ Sec.Emitted = true;
+ } else
+ OS << Sec.getName() << " CATTR\n";
+ break;
+ }
+ case GOFF::ESD_ST_PartReference: {
+ MCSectionGOFF *ED = Sec.getParent();
+ printSwitchToSection(*ED->getParent(), Subsection, T, OS);
+ if (!Sec.Emitted) {
+ emitCATTR(OS, ED->getName(), ED->getEDAttributes().Rmode,
+ ED->EDAttributes.Alignment, ED->EDAttributes.LoadBehavior,
+ Sec.PRAttributes.Executable, ED->EDAttributes.IsReadOnly,
+ Sec.PRAttributes.SortKey, ED->EDAttributes.FillByteValue,
+ Sec.getName());
+ emitXATTR(OS, Sec.getName(), Sec.PRAttributes.Linkage,
+ Sec.PRAttributes.Executable, Sec.PRAttributes.BindingScope);
+ ED->Emitted = true;
+ Sec.Emitted = true;
+ } else
+ OS << ED->getName() << " CATTR PART(" << Sec.getName() << ")\n";
+ break;
+ }
+ default:
+ llvm_unreachable("Wrong section type");
+ }
+}
diff --git a/llvm/lib/MC/MCAsmInfoWasm.cpp b/llvm/lib/MC/MCAsmInfoWasm.cpp
index ce6ec7e..5e44f48 100644
--- a/llvm/lib/MC/MCAsmInfoWasm.cpp
+++ b/llvm/lib/MC/MCAsmInfoWasm.cpp
@@ -12,9 +12,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAsmInfoWasm.h"
-using namespace llvm;
+#include "llvm/MC/MCSectionWasm.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/Support/raw_ostream.h"
-void MCAsmInfoWasm::anchor() {}
+using namespace llvm;
MCAsmInfoWasm::MCAsmInfoWasm() {
HasIdentDirective = true;
@@ -23,3 +25,80 @@ MCAsmInfoWasm::MCAsmInfoWasm() {
PrivateGlobalPrefix = ".L";
PrivateLabelPrefix = ".L";
}
+
+static void printName(raw_ostream &OS, StringRef Name) {
+ if (Name.find_first_not_of("0123456789_."
+ "abcdefghijklmnopqrstuvwxyz"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) {
+ OS << Name;
+ return;
+ }
+ OS << '"';
+ for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) {
+ if (*B == '"') // Unquoted "
+ OS << "\\\"";
+ else if (*B != '\\') // Neither " or backslash
+ OS << *B;
+ else if (B + 1 == E) // Trailing backslash
+ OS << "\\\\";
+ else {
+ OS << B[0] << B[1]; // Quoted character
+ ++B;
+ }
+ }
+ OS << '"';
+}
+
+void MCAsmInfoWasm::printSwitchToSection(const MCSection &Section,
+ uint32_t Subsection, const Triple &T,
+ raw_ostream &OS) const {
+ auto &Sec = static_cast<const MCSectionWasm &>(Section);
+ if (shouldOmitSectionDirective(Sec.getName())) {
+ OS << '\t' << Sec.getName();
+ if (Subsection)
+ OS << '\t' << Subsection;
+ OS << '\n';
+ return;
+ }
+
+ OS << "\t.section\t";
+ printName(OS, Sec.getName());
+ OS << ",\"";
+
+ if (Sec.IsPassive)
+ OS << 'p';
+ if (Sec.Group)
+ OS << 'G';
+ if (Sec.SegmentFlags & wasm::WASM_SEG_FLAG_STRINGS)
+ OS << 'S';
+ if (Sec.SegmentFlags & wasm::WASM_SEG_FLAG_TLS)
+ OS << 'T';
+ if (Sec.SegmentFlags & wasm::WASM_SEG_FLAG_RETAIN)
+ OS << 'R';
+
+ OS << '"';
+
+ OS << ',';
+
+ // If comment string is '@', e.g. as on ARM - use '%' instead
+ if (getCommentString()[0] == '@')
+ OS << '%';
+ else
+ OS << '@';
+
+ // TODO: Print section type.
+
+ if (Sec.Group) {
+ OS << ",";
+ printName(OS, Sec.Group->getName());
+ OS << ",comdat";
+ }
+
+ if (Sec.isUnique())
+ OS << ",unique," << Sec.UniqueID;
+
+ OS << '\n';
+
+ if (Subsection)
+ OS << "\t.subsection\t" << Subsection << '\n';
+}
diff --git a/llvm/lib/MC/MCAsmInfoXCOFF.cpp b/llvm/lib/MC/MCAsmInfoXCOFF.cpp
index 6ef11ba..0403b44 100644
--- a/llvm/lib/MC/MCAsmInfoXCOFF.cpp
+++ b/llvm/lib/MC/MCAsmInfoXCOFF.cpp
@@ -8,7 +8,11 @@
#include "llvm/MC/MCAsmInfoXCOFF.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -16,8 +20,6 @@ namespace llvm {
extern cl::opt<cl::boolOrDefault> UseLEB128Directives;
}
-void MCAsmInfoXCOFF::anchor() {}
-
MCAsmInfoXCOFF::MCAsmInfoXCOFF() {
IsAIX = true;
IsLittleEndian = false;
@@ -56,3 +58,121 @@ bool MCAsmInfoXCOFF::isAcceptableChar(char C) const {
// any combination of these.
return isAlnum(C) || C == '_' || C == '.';
}
+
+bool MCAsmInfoXCOFF::useCodeAlign(const MCSection &Sec) const {
+ return static_cast<const MCSectionXCOFF &>(Sec).getKind().isText();
+}
+
+MCSectionXCOFF::~MCSectionXCOFF() = default;
+
+void MCSectionXCOFF::printCsectDirective(raw_ostream &OS) const {
+ OS << "\t.csect " << QualName->getName() << "," << Log2(getAlign()) << '\n';
+}
+
+void MCAsmInfoXCOFF::printSwitchToSection(const MCSection &Section, uint32_t,
+ const Triple &T,
+ raw_ostream &OS) const {
+ auto &Sec = static_cast<const MCSectionXCOFF &>(Section);
+ if (Sec.getKind().isText()) {
+ if (Sec.getMappingClass() != XCOFF::XMC_PR)
+ report_fatal_error("Unhandled storage-mapping class for .text csect");
+
+ Sec.printCsectDirective(OS);
+ return;
+ }
+
+ if (Sec.getKind().isReadOnly()) {
+ if (Sec.getMappingClass() != XCOFF::XMC_RO &&
+ Sec.getMappingClass() != XCOFF::XMC_TD)
+ report_fatal_error("Unhandled storage-mapping class for .rodata csect.");
+ Sec.printCsectDirective(OS);
+ return;
+ }
+
+ if (Sec.getKind().isReadOnlyWithRel()) {
+ if (Sec.getMappingClass() != XCOFF::XMC_RW &&
+ Sec.getMappingClass() != XCOFF::XMC_RO &&
+ Sec.getMappingClass() != XCOFF::XMC_TD)
+ report_fatal_error(
+ "Unexepected storage-mapping class for ReadOnlyWithRel kind");
+ Sec.printCsectDirective(OS);
+ return;
+ }
+
+ // Initialized TLS data.
+ if (Sec.getKind().isThreadData()) {
+ // We only expect XMC_TL here for initialized TLS data.
+ if (Sec.getMappingClass() != XCOFF::XMC_TL)
+ report_fatal_error("Unhandled storage-mapping class for .tdata csect.");
+ Sec.printCsectDirective(OS);
+ return;
+ }
+
+ if (Sec.getKind().isData()) {
+ switch (Sec.getMappingClass()) {
+ case XCOFF::XMC_RW:
+ case XCOFF::XMC_DS:
+ case XCOFF::XMC_TD:
+ Sec.printCsectDirective(OS);
+ break;
+ case XCOFF::XMC_TC:
+ case XCOFF::XMC_TE:
+ break;
+ case XCOFF::XMC_TC0:
+ OS << "\t.toc\n";
+ break;
+ default:
+ report_fatal_error("Unhandled storage-mapping class for .data csect.");
+ }
+ return;
+ }
+
+ if (Sec.isCsect() && Sec.getMappingClass() == XCOFF::XMC_TD) {
+ // Common csect type (uninitialized storage) does not have to print
+ // csect directive for section switching unless it is local.
+ if (Sec.getKind().isCommon() && !Sec.getKind().isBSSLocal())
+ return;
+
+ assert(Sec.getKind().isBSS() && "Unexpected section kind for toc-data");
+ Sec.printCsectDirective(OS);
+ return;
+ }
+ // Common csect type (uninitialized storage) does not have to print csect
+ // directive for section switching.
+ if (Sec.isCsect() && Sec.getCSectType() == XCOFF::XTY_CM) {
+ assert((Sec.getMappingClass() == XCOFF::XMC_RW ||
+ Sec.getMappingClass() == XCOFF::XMC_BS ||
+ Sec.getMappingClass() == XCOFF::XMC_UL) &&
+ "Generated a storage-mapping class for a common/bss/tbss csect we "
+ "don't "
+ "understand how to switch to.");
+ // Common symbols and local zero-initialized symbols for TLS and Non-TLS are
+ // eligible for .bss/.tbss csect, getKind().isThreadBSS() is used to
+ // cover TLS common and zero-initialized local symbols since linkage type
+ // (in the GlobalVariable) is not accessible in this class.
+ assert((Sec.getKind().isBSSLocal() || Sec.getKind().isCommon() ||
+ Sec.getKind().isThreadBSS()) &&
+ "wrong symbol type for .bss/.tbss csect");
+ // Don't have to print a directive for switching to section for commons
+ // and zero-initialized TLS data. The '.comm' and '.lcomm' directives of the
+ // variable will create the needed csect.
+ return;
+ }
+
+ // Zero-initialized TLS data with weak or external linkage are not eligible to
+ // be put into common csect.
+ if (Sec.getKind().isThreadBSS()) {
+ Sec.printCsectDirective(OS);
+ return;
+ }
+
+ // XCOFF debug sections.
+ if (Sec.getKind().isMetadata() && Sec.isDwarfSect()) {
+ OS << "\n\t.dwsect " << format("0x%" PRIx32, *Sec.getDwarfSubtypeFlags())
+ << '\n';
+ OS << Sec.getName() << ':' << '\n';
+ return;
+ }
+
+ report_fatal_error("Printing for this SectionKind is unimplemented.");
+}
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index 67c53e0..da51da4 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -345,7 +345,7 @@ public:
void emitIdent(StringRef IdentString) override;
void emitCFIBKeyFrame() override;
void emitCFIMTETaggedFrame() override;
- void emitCFISections(bool EH, bool Debug) override;
+ void emitCFISections(bool EH, bool Debug, bool SFrame) override;
void emitCFIDefCfa(int64_t Register, int64_t Offset, SMLoc Loc) override;
void emitCFIDefCfaOffset(int64_t Offset, SMLoc Loc) override;
void emitCFIDefCfaRegister(int64_t Register, SMLoc Loc) override;
@@ -532,8 +532,8 @@ void MCAsmStreamer::switchSection(MCSection *Section, uint32_t Subsection) {
if (MCTargetStreamer *TS = getTargetStreamer()) {
TS->changeSection(Cur.first, Section, Subsection, OS);
} else {
- Section->printSwitchToSection(*MAI, getContext().getTargetTriple(), OS,
- Subsection);
+ MAI->printSwitchToSection(*Section, Subsection,
+ getContext().getTargetTriple(), OS);
}
}
MCStreamer::switchSection(Section, Subsection);
@@ -543,7 +543,7 @@ bool MCAsmStreamer::popSection() {
if (!MCStreamer::popSection())
return false;
auto [Sec, Subsec] = getCurrentSection();
- Sec->printSwitchToSection(*MAI, getContext().getTargetTriple(), OS, Subsec);
+ MAI->printSwitchToSection(*Sec, Subsec, getContext().getTargetTriple(), OS);
return true;
}
@@ -1105,7 +1105,7 @@ void MCAsmStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol,
// Note: a .zerofill directive does not switch sections.
OS << ".zerofill ";
- assert(Section->getVariant() == MCSection::SV_MachO &&
+ assert(getContext().getObjectFileType() == MCContext::IsMachO &&
".zerofill is a Mach-O specific directive");
// This is a mach-o specific directive.
@@ -1130,7 +1130,7 @@ void MCAsmStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
// Instead of using the Section we'll just use the shortcut.
- assert(Section->getVariant() == MCSection::SV_MachO &&
+ assert(getContext().getObjectFileType() == MCContext::IsMachO &&
".zerofill is a Mach-O specific directive");
// This is a mach-o specific directive and section.
@@ -1906,15 +1906,24 @@ void MCAsmStreamer::emitIdent(StringRef IdentString) {
EmitEOL();
}
-void MCAsmStreamer::emitCFISections(bool EH, bool Debug) {
- MCStreamer::emitCFISections(EH, Debug);
+void MCAsmStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) {
+ MCStreamer::emitCFISections(EH, Debug, SFrame);
OS << "\t.cfi_sections ";
+ bool C = false;
if (EH) {
OS << ".eh_frame";
- if (Debug)
- OS << ", .debug_frame";
- } else if (Debug) {
+ C = true;
+ }
+ if (Debug) {
+ if (C)
+ OS << ", ";
OS << ".debug_frame";
+ C = true;
+ }
+ if (SFrame) {
+ if (C)
+ OS << ", ";
+ OS << ".sframe";
}
EmitEOL();
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
index 2b56e2a..8500fd1 100644
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -8,7 +8,6 @@
#include "llvm/MC/MCAssembler.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
@@ -107,7 +106,6 @@ void MCAssembler::reset() {
bool MCAssembler::registerSection(MCSection &Section) {
if (Section.isRegistered())
return false;
- assert(Section.curFragList()->Head && "allocInitialFragment not called");
Sections.push_back(&Section);
Section.setIsRegistered(true);
return true;
diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp
index 12b3fba..39bf628 100644
--- a/llvm/lib/MC/MCContext.cpp
+++ b/llvm/lib/MC/MCContext.cpp
@@ -200,16 +200,6 @@ MCInst *MCContext::createMCInst() {
return new (MCInstAllocator.Allocate()) MCInst;
}
-// Allocate the initial MCFragment for the begin symbol.
-MCFragment *MCContext::allocInitialFragment(MCSection &Sec) {
- assert(!Sec.curFragList()->Head);
- auto *F = allocFragment<MCFragment>();
- F->setParent(&Sec);
- Sec.curFragList()->Head = F;
- Sec.curFragList()->Tail = F;
- return F;
-}
-
//===----------------------------------------------------------------------===//
// Symbol Manipulation
//===----------------------------------------------------------------------===//
@@ -443,17 +433,19 @@ MCSymbol *MCContext::getDirectionalLocalSymbol(unsigned LocalLabelVal,
return getOrCreateDirectionalLocalSymbol(LocalLabelVal, Instance);
}
+// Create a section symbol, with a distinct one for each section of the same.
+// The first symbol is used for assembly code references.
template <typename Symbol>
Symbol *MCContext::getOrCreateSectionSymbol(StringRef Section) {
Symbol *R;
auto &SymEntry = getSymbolTableEntry(Section);
MCSymbol *Sym = SymEntry.second.Symbol;
- // A section symbol can not redefine regular symbols. There may be multiple
- // sections with the same name, in which case the first such section wins.
if (Sym && Sym->isDefined() &&
(!Sym->isInSection() || Sym->getSection().getBeginSymbol() != Sym))
reportError(SMLoc(), "invalid symbol redefinition");
- if (Sym && Sym->isUndefined()) {
+ // Use the symbol's index to track if it has been used as a section symbol.
+ // Set to -1 to catch potential bugs if misused as a symbol index.
+ if (Sym && Sym->getIndex() != -1u) {
R = cast<Symbol>(Sym);
} else {
SymEntry.second.Used = true;
@@ -461,6 +453,8 @@ Symbol *MCContext::getOrCreateSectionSymbol(StringRef Section) {
if (!Sym)
SymEntry.second.Symbol = R;
}
+ // Mark as section symbol.
+ R->setIndex(-1u);
return R;
}
@@ -568,7 +562,6 @@ MCSectionMachO *MCContext::getMachOSection(StringRef Segment, StringRef Section,
MCSectionMachO(Segment, Name.substr(Name.size() - Section.size()),
TypeAndAttributes, Reserved2, Kind, Begin);
R.first->second = Ret;
- allocInitialFragment(*Ret);
return Ret;
}
@@ -579,15 +572,8 @@ MCSectionELF *MCContext::createELFSectionImpl(StringRef Section, unsigned Type,
bool Comdat, unsigned UniqueID,
const MCSymbolELF *LinkedToSym) {
auto *R = getOrCreateSectionSymbol<MCSymbolELF>(Section);
- R->setBinding(ELF::STB_LOCAL);
- R->setType(ELF::STT_SECTION);
-
- auto *Ret = new (ELFAllocator.Allocate()) MCSectionELF(
+ return new (ELFAllocator.Allocate()) MCSectionELF(
Section, Type, Flags, EntrySize, Group, Comdat, UniqueID, R, LinkedToSym);
-
- auto *F = allocInitialFragment(*Ret);
- R->setFragment(F);
- return Ret;
}
MCSectionELF *
@@ -743,7 +729,6 @@ MCSectionGOFF *MCContext::getGOFFSection(SectionKind Kind, StringRef Name,
MCSectionGOFF(CachedName, Kind, IsVirtual, Attributes,
static_cast<MCSectionGOFF *>(Parent));
Iter->second = GOFFSection;
- allocInitialFragment(*GOFFSection);
return GOFFSection;
}
@@ -782,8 +767,8 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
if (Selection != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE &&
COMDATSymbol->isDefined() &&
(!COMDATSymbol->isInSection() ||
- cast<MCSectionCOFF>(COMDATSymbol->getSection()).getCOMDATSymbol() !=
- COMDATSymbol))
+ static_cast<const MCSectionCOFF &>(COMDATSymbol->getSection())
+ .getCOMDATSymbol() != COMDATSymbol))
reportError(SMLoc(), "invalid symbol redefinition");
}
@@ -798,8 +783,7 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
MCSectionCOFF *Result = new (COFFAllocator.Allocate()) MCSectionCOFF(
CachedName, Characteristics, COMDATSymbol, Selection, UniqueID, Begin);
Iter->second = Result;
- auto *F = allocInitialFragment(*Result);
- Begin->setFragment(F);
+ Begin->setFragment(&Result->getDummyFragment());
return Result;
}
@@ -870,8 +854,6 @@ MCSectionWasm *MCContext::getWasmSection(const Twine &Section, SectionKind Kind,
MCSectionWasm(CachedName, Kind, Flags, GroupSym, UniqueID, Begin);
Entry.second = Result;
- auto *F = allocInitialFragment(*Result);
- Begin->setFragment(F);
return Result;
}
@@ -927,24 +909,11 @@ MCSectionXCOFF *MCContext::getXCOFFSection(
MultiSymbolsAllowed);
Entry.second = Result;
-
- auto *F = allocInitialFragment(*Result);
-
- // We might miss calculating the symbols difference as absolute value before
- // adding fixups when symbol_A without the fragment set is the csect itself
- // and symbol_B is in it.
- // TODO: Currently we only set the fragment for XMC_PR csects and DWARF
- // sections because we don't have other cases that hit this problem yet.
- if (IsDwarfSec || CsectProp->MappingClass == XCOFF::XMC_PR)
- QualName->setFragment(F);
-
return Result;
}
MCSectionSPIRV *MCContext::getSPIRVSection() {
MCSectionSPIRV *Result = new (SPIRVAllocator.Allocate()) MCSectionSPIRV();
-
- allocInitialFragment(*Result);
return Result;
}
@@ -964,7 +933,6 @@ MCSectionDXContainer *MCContext::getDXContainerSection(StringRef Section,
new (DXCAllocator.Allocate()) MCSectionDXContainer(Name, K, nullptr);
// The first fragment will store the header
- allocInitialFragment(*MapIt->second);
return MapIt->second;
}
diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp
index b8cbaea5..38744a0 100644
--- a/llvm/lib/MC/MCELFStreamer.cpp
+++ b/llvm/lib/MC/MCELFStreamer.cpp
@@ -89,7 +89,9 @@ void MCELFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
getWriter().markGnuAbi();
MCObjectStreamer::changeSection(Section, Subsection);
- Asm.registerSymbol(*Section->getBeginSymbol());
+ auto *Sym = static_cast<MCSymbolELF *>(Section->getBeginSymbol());
+ Sym->setBinding(ELF::STB_LOCAL);
+ Sym->setType(ELF::STT_SECTION);
}
void MCELFStreamer::emitWeakReference(MCSymbol *Alias, const MCSymbol *Target) {
diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp
index 3c395e5..6cbdf74 100644
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@@ -35,7 +35,7 @@ MCFragment::MCFragment(FragmentType Kind, bool HasInstructions)
}
const MCSymbol *MCFragment::getAtom() const {
- return cast<MCSectionMachO>(Parent)->getAtom(LayoutOrder);
+ return static_cast<const MCSectionMachO *>(Parent)->getAtom(LayoutOrder);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/MC/MCGOFFStreamer.cpp b/llvm/lib/MC/MCGOFFStreamer.cpp
index b702191..1718e2a 100644
--- a/llvm/lib/MC/MCGOFFStreamer.cpp
+++ b/llvm/lib/MC/MCGOFFStreamer.cpp
@@ -26,19 +26,15 @@ GOFFObjectWriter &MCGOFFStreamer::getWriter() {
return static_cast<GOFFObjectWriter &>(getAssembler().getWriter());
}
-// Make sure that all section are registered in the correct order.
-static void registerSectionHierarchy(MCAssembler &Asm, MCSectionGOFF *Section) {
- if (Section->isRegistered())
- return;
- if (Section->getParent())
- registerSectionHierarchy(Asm, Section->getParent());
- Asm.registerSection(*Section);
-}
-
void MCGOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
- registerSectionHierarchy(getAssembler(),
- static_cast<MCSectionGOFF *>(Section));
- MCObjectStreamer::changeSection(Section, Subsection);
+ // Make sure that all section are registered in the correct order.
+ SmallVector<MCSectionGOFF *> Sections;
+ for (auto *S = static_cast<MCSectionGOFF *>(Section); S; S = S->getParent())
+ Sections.push_back(S);
+ while (!Sections.empty()) {
+ auto *S = Sections.pop_back_val();
+ MCObjectStreamer::changeSection(S, Sections.empty() ? Subsection : 0);
+ }
}
MCStreamer *llvm::createGOFFStreamer(MCContext &Context,
diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp
index 8f023bd..1074669 100644
--- a/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/llvm/lib/MC/MCMachOStreamer.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/MachO.h"
@@ -141,6 +140,8 @@ void MCMachOStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
MCSymbol *Label = getContext().createLinkerPrivateTempSymbol();
Section->setBeginSymbol(Label);
HasSectionLabel[Section] = true;
+ if (!Label->isInSection())
+ emitLabel(Label);
}
}
@@ -442,13 +443,13 @@ void MCMachOStreamer::finishImpl() {
// Set the fragment atom associations by tracking the last seen atom defining
// symbol.
for (MCSection &Sec : getAssembler()) {
- cast<MCSectionMachO>(Sec).allocAtoms();
+ static_cast<MCSectionMachO &>(Sec).allocAtoms();
const MCSymbol *CurrentAtom = nullptr;
size_t I = 0;
for (MCFragment &Frag : Sec) {
if (const MCSymbol *Symbol = DefiningSymbolMap.lookup(&Frag))
CurrentAtom = Symbol;
- cast<MCSectionMachO>(Sec).setAtom(I++, CurrentAtom);
+ static_cast<MCSectionMachO &>(Sec).setAtom(I++, CurrentAtom);
}
}
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 42f4cf4..f046552 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -19,7 +19,6 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SourceMgr.h"
using namespace llvm;
@@ -130,10 +129,11 @@ void MCObjectStreamer::visitUsedSymbol(const MCSymbol &Sym) {
Assembler->registerSymbol(Sym);
}
-void MCObjectStreamer::emitCFISections(bool EH, bool Debug) {
- MCStreamer::emitCFISections(EH, Debug);
+void MCObjectStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) {
+ MCStreamer::emitCFISections(EH, Debug, SFrame);
EmitEHFrame = EH;
EmitDebugFrame = Debug;
+ EmitSFrame = SFrame;
}
void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
@@ -185,10 +185,10 @@ void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
getAssembler().registerSymbol(*Symbol);
- // If there is a current fragment, mark the symbol as pointing into it.
- // Otherwise queue the label and set its fragment pointer when we emit the
- // next fragment.
- MCFragment *F = getCurrentFragment();
+ // Set the fragment and offset. This function might be called by
+ // changeSection, when the section stack top hasn't been changed to the new
+ // section.
+ MCFragment *F = CurFrag;
Symbol->setFragment(F);
Symbol->setOffset(F->getContents().size());
@@ -247,6 +247,15 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
assert(Section && "Cannot switch to a null section!");
getContext().clearDwarfLocSeen();
+ // Register the section and create an initial fragment for subsection 0
+ // if `Subsection` is non-zero.
+ bool NewSec = getAssembler().registerSection(*Section);
+ MCFragment *F0 = nullptr;
+ if (NewSec && Subsection) {
+ changeSection(Section, 0);
+ F0 = CurFrag;
+ }
+
auto &Subsections = Section->Subsections;
size_t I = 0, E = Subsections.size();
while (I != E && Subsections[I].first < Subsection)
@@ -262,12 +271,13 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
Section->CurFragList = &Subsections[I].second;
CurFrag = Section->CurFragList->Tail;
- getAssembler().registerSection(*Section);
-}
-
-void MCObjectStreamer::switchSectionNoPrint(MCSection *Section) {
- MCStreamer::switchSectionNoPrint(Section);
- changeSection(Section, 0);
+ // Define the section symbol at subsection 0's initial fragment if required.
+ if (!NewSec)
+ return;
+ if (auto *Sym = Section->getBeginSymbol()) {
+ Sym->setFragment(Subsection ? F0 : CurFrag);
+ getAssembler().registerSymbol(*Sym);
+ }
}
void MCObjectStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index d0b6ea4..9f64a98 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -3413,7 +3413,7 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, uint8_t ValueSize) {
// Check whether we should use optimal code alignment for this .align
// directive.
- if (Section->useCodeAlign() && !HasFillExpr) {
+ if (MAI.useCodeAlign(*Section) && !HasFillExpr) {
getStreamer().emitCodeAlignment(
Align(Alignment), &getTargetParser().getSTI(), MaxBytesToFill);
} else {
@@ -4093,27 +4093,30 @@ bool AsmParser::parseDirectiveCVFPOData() {
}
/// parseDirectiveCFISections
-/// ::= .cfi_sections section [, section]
+/// ::= .cfi_sections section [, section][, section]
bool AsmParser::parseDirectiveCFISections() {
StringRef Name;
bool EH = false;
bool Debug = false;
+ bool SFrame = false;
if (!parseOptionalToken(AsmToken::EndOfStatement)) {
for (;;) {
if (parseIdentifier(Name))
- return TokError("expected .eh_frame or .debug_frame");
+ return TokError("expected .eh_frame, .debug_frame, or .sframe");
if (Name == ".eh_frame")
EH = true;
else if (Name == ".debug_frame")
Debug = true;
+ else if (Name == ".sframe")
+ SFrame = true;
if (parseOptionalToken(AsmToken::EndOfStatement))
break;
if (parseComma())
return true;
}
}
- getStreamer().emitCFISections(EH, Debug);
+ getStreamer().emitCFISections(EH, Debug, SFrame);
return false;
}
diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
index c7c3df3..2e251cc 100644
--- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
@@ -644,8 +644,8 @@ EndStmt:
}
if (UseLastGroup) {
- if (const MCSectionELF *Section =
- cast_or_null<MCSectionELF>(getStreamer().getCurrentSectionOnly()))
+ if (auto *Section = static_cast<const MCSectionELF *>(
+ getStreamer().getCurrentSectionOnly()))
if (const MCSymbol *Group = Section->getGroup()) {
GroupName = Group->getName();
IsComdat = Section->isComdat();
diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp
index f4684e6..780289e 100644
--- a/llvm/lib/MC/MCParser/MasmParser.cpp
+++ b/llvm/lib/MC/MCParser/MasmParser.cpp
@@ -4228,8 +4228,7 @@ bool MasmParser::emitAlignTo(int64_t Alignment) {
// Check whether we should use optimal code alignment for this align
// directive.
const MCSection *Section = getStreamer().getCurrentSectionOnly();
- assert(Section && "must have section to emit alignment");
- if (Section->useCodeAlign()) {
+ if (MAI.useCodeAlign(*Section)) {
getStreamer().emitCodeAlignment(Align(Alignment),
&getTargetParser().getSTI(),
/*MaxBytesToEmit=*/0);
diff --git a/llvm/lib/MC/MCParser/WasmAsmParser.cpp b/llvm/lib/MC/MCParser/WasmAsmParser.cpp
index 1f824b8..d97f4f5 100644
--- a/llvm/lib/MC/MCParser/WasmAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/WasmAsmParser.cpp
@@ -252,7 +252,7 @@ public:
if (TypeName == "function") {
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
auto *Current =
- cast<MCSectionWasm>(getStreamer().getCurrentSectionOnly());
+ static_cast<MCSectionWasm *>(getStreamer().getCurrentSectionOnly());
if (Current->getGroup())
WasmSym->setComdat(true);
} else if (TypeName == "global")
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index 023f7f2..4f28267 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -18,12 +18,10 @@
using namespace llvm;
-MCSection::MCSection(SectionVariant V, StringRef Name, bool IsText, bool IsBss,
- MCSymbol *Begin)
+MCSection::MCSection(StringRef Name, bool IsText, bool IsBss, MCSymbol *Begin)
: Begin(Begin), HasInstructions(false), IsRegistered(false), IsText(IsText),
- IsBss(IsBss), LinkerRelaxable(false), Name(Name), Variant(V) {
- // The initial subsection number is 0. Create a fragment list.
- CurFragList = &Subsections.emplace_back(0u, FragList{}).second;
+ IsBss(IsBss), LinkerRelaxable(false), Name(Name) {
+ DummyFragment.setParent(this);
}
MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) {
diff --git a/llvm/lib/MC/MCSectionCOFF.cpp b/llvm/lib/MC/MCSectionCOFF.cpp
deleted file mode 100644
index 5bf1473..0000000
--- a/llvm/lib/MC/MCSectionCOFF.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-//===- lib/MC/MCSectionCOFF.cpp - COFF Code Section Representation --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCSectionCOFF.h"
-#include "llvm/BinaryFormat/COFF.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-
-using namespace llvm;
-
-// shouldOmitSectionDirective - Decides whether a '.section' directive
-// should be printed before the section name
-bool MCSectionCOFF::shouldOmitSectionDirective(StringRef Name,
- const MCAsmInfo &MAI) const {
- if (COMDATSymbol || isUnique())
- return false;
-
- // FIXME: Does .section .bss/.data/.text work everywhere??
- if (Name == ".text" || Name == ".data" || Name == ".bss")
- return true;
-
- return false;
-}
-
-void MCSectionCOFF::setSelection(int Selection) const {
- assert(Selection != 0 && "invalid COMDAT selection type");
- this->Selection = Selection;
- Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
-}
-
-void MCSectionCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const {
- // standard sections don't require the '.section'
- if (shouldOmitSectionDirective(getName(), MAI)) {
- OS << '\t' << getName() << '\n';
- return;
- }
-
- OS << "\t.section\t" << getName() << ",\"";
- if (getCharacteristics() & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
- OS << 'd';
- if (getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
- OS << 'b';
- if (getCharacteristics() & COFF::IMAGE_SCN_MEM_EXECUTE)
- OS << 'x';
- if (getCharacteristics() & COFF::IMAGE_SCN_MEM_WRITE)
- OS << 'w';
- else if (getCharacteristics() & COFF::IMAGE_SCN_MEM_READ)
- OS << 'r';
- else
- OS << 'y';
- if (getCharacteristics() & COFF::IMAGE_SCN_LNK_REMOVE)
- OS << 'n';
- if (getCharacteristics() & COFF::IMAGE_SCN_MEM_SHARED)
- OS << 's';
- if ((getCharacteristics() & COFF::IMAGE_SCN_MEM_DISCARDABLE) &&
- !isImplicitlyDiscardable(getName()))
- OS << 'D';
- if (getCharacteristics() & COFF::IMAGE_SCN_LNK_INFO)
- OS << 'i';
- OS << '"';
-
- // unique should be tail of .section directive.
- if (isUnique() && !COMDATSymbol)
- OS << ",unique," << UniqueID;
-
- if (getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) {
- if (COMDATSymbol)
- OS << ",";
- else
- OS << "\n\t.linkonce\t";
- switch (Selection) {
- case COFF::IMAGE_COMDAT_SELECT_NODUPLICATES:
- OS << "one_only";
- break;
- case COFF::IMAGE_COMDAT_SELECT_ANY:
- OS << "discard";
- break;
- case COFF::IMAGE_COMDAT_SELECT_SAME_SIZE:
- OS << "same_size";
- break;
- case COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH:
- OS << "same_contents";
- break;
- case COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE:
- OS << "associative";
- break;
- case COFF::IMAGE_COMDAT_SELECT_LARGEST:
- OS << "largest";
- break;
- case COFF::IMAGE_COMDAT_SELECT_NEWEST:
- OS << "newest";
- break;
- default:
- assert(false && "unsupported COFF selection type");
- break;
- }
- if (COMDATSymbol) {
- OS << ",";
- COMDATSymbol->print(OS, &MAI);
- }
- }
-
- if (isUnique() && COMDATSymbol)
- OS << ",unique," << UniqueID;
-
- OS << '\n';
-}
-
-bool MCSectionCOFF::useCodeAlign() const { return isText(); }
diff --git a/llvm/lib/MC/MCSectionDXContainer.cpp b/llvm/lib/MC/MCSectionDXContainer.cpp
deleted file mode 100644
index 7eee59d..0000000
--- a/llvm/lib/MC/MCSectionDXContainer.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-//===- lib/MC/MCSectionDXContainer.cpp - DXContainer Section --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCSectionDXContainer.h"
-
-using namespace llvm;
-
-void MCSectionDXContainer::printSwitchToSection(const MCAsmInfo &,
- const Triple &, raw_ostream &,
- uint32_t) const {}
diff --git a/llvm/lib/MC/MCSectionELF.cpp b/llvm/lib/MC/MCSectionELF.cpp
deleted file mode 100644
index ef33f9c..0000000
--- a/llvm/lib/MC/MCSectionELF.cpp
+++ /dev/null
@@ -1,217 +0,0 @@
-//===- lib/MC/MCSectionELF.cpp - ELF Code Section Representation ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCSectionELF.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/TargetParser/Triple.h"
-#include <cassert>
-
-using namespace llvm;
-
-// Decides whether a '.section' directive
-// should be printed before the section name.
-bool MCSectionELF::shouldOmitSectionDirective(StringRef Name,
- const MCAsmInfo &MAI) const {
- if (isUnique())
- return false;
-
- return MAI.shouldOmitSectionDirective(Name);
-}
-
-static void printName(raw_ostream &OS, StringRef Name) {
- if (Name.find_first_not_of("0123456789_."
- "abcdefghijklmnopqrstuvwxyz"
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) {
- OS << Name;
- return;
- }
- OS << '"';
- for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) {
- if (*B == '"') // Unquoted "
- OS << "\\\"";
- else if (*B != '\\') // Neither " or backslash
- OS << *B;
- else if (B + 1 == E) // Trailing backslash
- OS << "\\\\";
- else {
- OS << B[0] << B[1]; // Quoted character
- ++B;
- }
- }
- OS << '"';
-}
-
-void MCSectionELF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const {
- if (shouldOmitSectionDirective(getName(), MAI)) {
- OS << '\t' << getName();
- if (Subsection)
- OS << '\t' << Subsection;
- OS << '\n';
- return;
- }
-
- OS << "\t.section\t";
- printName(OS, getName());
-
- // Handle the weird solaris syntax if desired.
- if (MAI.usesSunStyleELFSectionSwitchSyntax() &&
- !(Flags & ELF::SHF_MERGE)) {
- if (Flags & ELF::SHF_ALLOC)
- OS << ",#alloc";
- if (Flags & ELF::SHF_EXECINSTR)
- OS << ",#execinstr";
- if (Flags & ELF::SHF_WRITE)
- OS << ",#write";
- if (Flags & ELF::SHF_EXCLUDE)
- OS << ",#exclude";
- if (Flags & ELF::SHF_TLS)
- OS << ",#tls";
- OS << '\n';
- return;
- }
-
- OS << ",\"";
- if (Flags & ELF::SHF_ALLOC)
- OS << 'a';
- if (Flags & ELF::SHF_EXCLUDE)
- OS << 'e';
- if (Flags & ELF::SHF_EXECINSTR)
- OS << 'x';
- if (Flags & ELF::SHF_WRITE)
- OS << 'w';
- if (Flags & ELF::SHF_MERGE)
- OS << 'M';
- if (Flags & ELF::SHF_STRINGS)
- OS << 'S';
- if (Flags & ELF::SHF_TLS)
- OS << 'T';
- if (Flags & ELF::SHF_LINK_ORDER)
- OS << 'o';
- if (Flags & ELF::SHF_GROUP)
- OS << 'G';
- if (Flags & ELF::SHF_GNU_RETAIN)
- OS << 'R';
-
- // If there are os-specific flags, print them.
- if (T.isOSSolaris())
- if (Flags & ELF::SHF_SUNW_NODISCARD)
- OS << 'R';
-
- // If there are target-specific flags, print them.
- Triple::ArchType Arch = T.getArch();
- if (Arch == Triple::xcore) {
- if (Flags & ELF::XCORE_SHF_CP_SECTION)
- OS << 'c';
- if (Flags & ELF::XCORE_SHF_DP_SECTION)
- OS << 'd';
- } else if (T.isARM() || T.isThumb()) {
- if (Flags & ELF::SHF_ARM_PURECODE)
- OS << 'y';
- } else if (T.isAArch64()) {
- if (Flags & ELF::SHF_AARCH64_PURECODE)
- OS << 'y';
- } else if (Arch == Triple::hexagon) {
- if (Flags & ELF::SHF_HEX_GPREL)
- OS << 's';
- } else if (Arch == Triple::x86_64) {
- if (Flags & ELF::SHF_X86_64_LARGE)
- OS << 'l';
- }
-
- OS << '"';
-
- OS << ',';
-
- // If comment string is '@', e.g. as on ARM - use '%' instead
- if (MAI.getCommentString()[0] == '@')
- OS << '%';
- else
- OS << '@';
-
- if (Type == ELF::SHT_INIT_ARRAY)
- OS << "init_array";
- else if (Type == ELF::SHT_FINI_ARRAY)
- OS << "fini_array";
- else if (Type == ELF::SHT_PREINIT_ARRAY)
- OS << "preinit_array";
- else if (Type == ELF::SHT_NOBITS)
- OS << "nobits";
- else if (Type == ELF::SHT_NOTE)
- OS << "note";
- else if (Type == ELF::SHT_PROGBITS)
- OS << "progbits";
- else if (Type == ELF::SHT_X86_64_UNWIND)
- OS << "unwind";
- else if (Type == ELF::SHT_MIPS_DWARF)
- // Print hex value of the flag while we do not have
- // any standard symbolic representation of the flag.
- OS << "0x7000001e";
- else if (Type == ELF::SHT_LLVM_ODRTAB)
- OS << "llvm_odrtab";
- else if (Type == ELF::SHT_LLVM_LINKER_OPTIONS)
- OS << "llvm_linker_options";
- else if (Type == ELF::SHT_LLVM_CALL_GRAPH_PROFILE)
- OS << "llvm_call_graph_profile";
- else if (Type == ELF::SHT_LLVM_DEPENDENT_LIBRARIES)
- OS << "llvm_dependent_libraries";
- else if (Type == ELF::SHT_LLVM_SYMPART)
- OS << "llvm_sympart";
- else if (Type == ELF::SHT_LLVM_BB_ADDR_MAP)
- OS << "llvm_bb_addr_map";
- else if (Type == ELF::SHT_LLVM_OFFLOADING)
- OS << "llvm_offloading";
- else if (Type == ELF::SHT_LLVM_LTO)
- OS << "llvm_lto";
- else if (Type == ELF::SHT_LLVM_JT_SIZES)
- OS << "llvm_jt_sizes";
- else if (Type == ELF::SHT_LLVM_CFI_JUMP_TABLE)
- OS << "llvm_cfi_jump_table";
- else
- OS << "0x" << Twine::utohexstr(Type);
-
- if (EntrySize) {
- assert((Flags & ELF::SHF_MERGE) || Type == ELF::SHT_LLVM_CFI_JUMP_TABLE);
- OS << "," << EntrySize;
- }
-
- if (Flags & ELF::SHF_LINK_ORDER) {
- OS << ",";
- if (LinkedToSym)
- printName(OS, LinkedToSym->getName());
- else
- OS << '0';
- }
-
- if (Flags & ELF::SHF_GROUP) {
- OS << ",";
- printName(OS, Group.getPointer()->getName());
- if (isComdat())
- OS << ",comdat";
- }
-
- if (isUnique())
- OS << ",unique," << UniqueID;
-
- OS << '\n';
-
- if (Subsection) {
- OS << "\t.subsection\t" << Subsection;
- OS << '\n';
- }
-}
-
-bool MCSectionELF::useCodeAlign() const {
- return getFlags() & ELF::SHF_EXECINSTR;
-}
diff --git a/llvm/lib/MC/MCSectionGOFF.cpp b/llvm/lib/MC/MCSectionGOFF.cpp
deleted file mode 100644
index 8163e5b..0000000
--- a/llvm/lib/MC/MCSectionGOFF.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-//===- MCSectionGOFF.cpp - GOFF Code Section Representation ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCSectionGOFF.h"
-#include "llvm/BinaryFormat/GOFF.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-static void emitCATTR(raw_ostream &OS, StringRef Name, GOFF::ESDRmode Rmode,
- GOFF::ESDAlignment Alignment,
- GOFF::ESDLoadingBehavior LoadBehavior,
- GOFF::ESDExecutable Executable, bool IsReadOnly,
- uint32_t SortKey, uint8_t FillByteValue,
- StringRef PartName) {
- OS << Name << " CATTR ";
- OS << "ALIGN(" << static_cast<unsigned>(Alignment) << "),"
- << "FILL(" << static_cast<unsigned>(FillByteValue) << ")";
- switch (LoadBehavior) {
- case GOFF::ESD_LB_Deferred:
- OS << ",DEFLOAD";
- break;
- case GOFF::ESD_LB_NoLoad:
- OS << ",NOLOAD";
- break;
- default:
- break;
- }
- switch (Executable) {
- case GOFF::ESD_EXE_CODE:
- OS << ",EXECUTABLE";
- break;
- case GOFF::ESD_EXE_DATA:
- OS << ",NOTEXECUTABLE";
- break;
- default:
- break;
- }
- if (IsReadOnly)
- OS << ",READONLY";
- if (Rmode != GOFF::ESD_RMODE_None) {
- OS << ',';
- OS << "RMODE(";
- switch (Rmode) {
- case GOFF::ESD_RMODE_24:
- OS << "24";
- break;
- case GOFF::ESD_RMODE_31:
- OS << "31";
- break;
- case GOFF::ESD_RMODE_64:
- OS << "64";
- break;
- case GOFF::ESD_RMODE_None:
- break;
- }
- OS << ')';
- }
- if (SortKey)
- OS << ",PRIORITY(" << SortKey << ")";
- if (!PartName.empty())
- OS << ",PART(" << PartName << ")";
- OS << '\n';
-}
-
-static void emitXATTR(raw_ostream &OS, StringRef Name,
- GOFF::ESDLinkageType Linkage,
- GOFF::ESDExecutable Executable,
- GOFF::ESDBindingScope BindingScope) {
- OS << Name << " XATTR ";
- OS << "LINKAGE(" << (Linkage == GOFF::ESD_LT_OS ? "OS" : "XPLINK") << "),";
- if (Executable != GOFF::ESD_EXE_Unspecified)
- OS << "REFERENCE(" << (Executable == GOFF::ESD_EXE_CODE ? "CODE" : "DATA")
- << "),";
- if (BindingScope != GOFF::ESD_BSC_Unspecified) {
- OS << "SCOPE(";
- switch (BindingScope) {
- case GOFF::ESD_BSC_Section:
- OS << "SECTION";
- break;
- case GOFF::ESD_BSC_Module:
- OS << "MODULE";
- break;
- case GOFF::ESD_BSC_Library:
- OS << "LIBRARY";
- break;
- case GOFF::ESD_BSC_ImportExport:
- OS << "EXPORT";
- break;
- default:
- break;
- }
- OS << ')';
- }
- OS << '\n';
-}
-
-void MCSectionGOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const {
- switch (SymbolType) {
- case GOFF::ESD_ST_SectionDefinition: {
- OS << Name << " CSECT\n";
- Emitted = true;
- break;
- }
- case GOFF::ESD_ST_ElementDefinition: {
- getParent()->printSwitchToSection(MAI, T, OS, Subsection);
- if (!Emitted) {
- emitCATTR(OS, Name, EDAttributes.Rmode, EDAttributes.Alignment,
- EDAttributes.LoadBehavior, GOFF::ESD_EXE_Unspecified,
- EDAttributes.IsReadOnly, 0, EDAttributes.FillByteValue,
- StringRef());
- Emitted = true;
- } else
- OS << Name << " CATTR\n";
- break;
- }
- case GOFF::ESD_ST_PartReference: {
- MCSectionGOFF *ED = getParent();
- ED->getParent()->printSwitchToSection(MAI, T, OS, Subsection);
- if (!Emitted) {
- emitCATTR(OS, ED->getName(), ED->getEDAttributes().Rmode,
- ED->EDAttributes.Alignment, ED->EDAttributes.LoadBehavior,
- PRAttributes.Executable, ED->EDAttributes.IsReadOnly,
- PRAttributes.SortKey, ED->EDAttributes.FillByteValue, Name);
- emitXATTR(OS, Name, PRAttributes.Linkage, PRAttributes.Executable,
- PRAttributes.BindingScope);
- ED->Emitted = true;
- Emitted = true;
- } else
- OS << ED->getName() << " CATTR PART(" << Name << ")\n";
- break;
- }
- default:
- llvm_unreachable("Wrong section type");
- }
-} \ No newline at end of file
diff --git a/llvm/lib/MC/MCSectionMachO.cpp b/llvm/lib/MC/MCSectionMachO.cpp
index 67453ce..67c8235 100644
--- a/llvm/lib/MC/MCSectionMachO.cpp
+++ b/llvm/lib/MC/MCSectionMachO.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
#include "llvm/MC/SectionKind.h"
#include "llvm/Support/raw_ostream.h"
@@ -92,7 +93,7 @@ ENTRY("" /*FIXME*/, S_ATTR_LOC_RELOC)
MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section,
unsigned TAA, unsigned reserved2, SectionKind K,
MCSymbol *Begin)
- : MCSection(SV_MachO, Section, K.isText(),
+ : MCSection(Section, K.isText(),
MachO::isVirtualSection(TAA & MachO::SECTION_TYPE), Begin),
TypeAndAttributes(TAA), Reserved2(reserved2) {
assert(Segment.size() <= 16 && Section.size() <= 16 &&
@@ -105,19 +106,20 @@ MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section,
}
}
-void MCSectionMachO::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const {
- OS << "\t.section\t" << getSegmentName() << ',' << getName();
+void MCAsmInfoDarwin::printSwitchToSection(const MCSection &Section, uint32_t,
+ const Triple &T,
+ raw_ostream &OS) const {
+ auto &Sec = static_cast<const MCSectionMachO &>(Section);
+ OS << "\t.section\t" << Sec.getSegmentName() << ',' << Sec.getName();
// Get the section type and attributes.
- unsigned TAA = getTypeAndAttributes();
+ unsigned TAA = Sec.getTypeAndAttributes();
if (TAA == 0) {
OS << '\n';
return;
}
- MachO::SectionType SectionType = getType();
+ MachO::SectionType SectionType = Sec.getType();
assert(SectionType <= MachO::LAST_KNOWN_SECTION_TYPE &&
"Invalid SectionType specified!");
@@ -135,8 +137,8 @@ void MCSectionMachO::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
if (SectionAttrs == 0) {
// If we have a S_SYMBOL_STUBS size specified, print it along with 'none' as
// the attribute specifier.
- if (Reserved2 != 0)
- OS << ",none," << Reserved2;
+ if (Sec.Reserved2 != 0)
+ OS << ",none," << Sec.Reserved2;
OS << '\n';
return;
}
@@ -164,15 +166,11 @@ void MCSectionMachO::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
assert(SectionAttrs == 0 && "Unknown section attributes!");
// If we have a S_SYMBOL_STUBS size specified, print it.
- if (Reserved2 != 0)
- OS << ',' << Reserved2;
+ if (Sec.Reserved2 != 0)
+ OS << ',' << Sec.Reserved2;
OS << '\n';
}
-bool MCSectionMachO::useCodeAlign() const {
- return hasAttribute(MachO::S_ATTR_PURE_INSTRUCTIONS);
-}
-
/// ParseSectionSpecifier - Parse the section specifier indicated by "Spec".
/// This is a string that can appear after a .section directive in a mach-o
/// flavored .s file. If successful, this fills in the specified Out
diff --git a/llvm/lib/MC/MCSectionWasm.cpp b/llvm/lib/MC/MCSectionWasm.cpp
deleted file mode 100644
index e25af1c..0000000
--- a/llvm/lib/MC/MCSectionWasm.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-//===- lib/MC/MCSectionWasm.cpp - Wasm Code Section Representation --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCSectionWasm.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSymbolWasm.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-// Decides whether a '.section' directive
-// should be printed before the section name.
-bool MCSectionWasm::shouldOmitSectionDirective(StringRef Name,
- const MCAsmInfo &MAI) const {
- return MAI.shouldOmitSectionDirective(Name);
-}
-
-static void printName(raw_ostream &OS, StringRef Name) {
- if (Name.find_first_not_of("0123456789_."
- "abcdefghijklmnopqrstuvwxyz"
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) {
- OS << Name;
- return;
- }
- OS << '"';
- for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) {
- if (*B == '"') // Unquoted "
- OS << "\\\"";
- else if (*B != '\\') // Neither " or backslash
- OS << *B;
- else if (B + 1 == E) // Trailing backslash
- OS << "\\\\";
- else {
- OS << B[0] << B[1]; // Quoted character
- ++B;
- }
- }
- OS << '"';
-}
-
-void MCSectionWasm::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const {
-
- if (shouldOmitSectionDirective(getName(), MAI)) {
- OS << '\t' << getName();
- if (Subsection)
- OS << '\t' << Subsection;
- OS << '\n';
- return;
- }
-
- OS << "\t.section\t";
- printName(OS, getName());
- OS << ",\"";
-
- if (IsPassive)
- OS << 'p';
- if (Group)
- OS << 'G';
- if (SegmentFlags & wasm::WASM_SEG_FLAG_STRINGS)
- OS << 'S';
- if (SegmentFlags & wasm::WASM_SEG_FLAG_TLS)
- OS << 'T';
- if (SegmentFlags & wasm::WASM_SEG_FLAG_RETAIN)
- OS << 'R';
-
- OS << '"';
-
- OS << ',';
-
- // If comment string is '@', e.g. as on ARM - use '%' instead
- if (MAI.getCommentString()[0] == '@')
- OS << '%';
- else
- OS << '@';
-
- // TODO: Print section type.
-
- if (Group) {
- OS << ",";
- printName(OS, Group->getName());
- OS << ",comdat";
- }
-
- if (isUnique())
- OS << ",unique," << UniqueID;
-
- OS << '\n';
-
- if (Subsection)
- OS << "\t.subsection\t" << Subsection << '\n';
-}
-
-bool MCSectionWasm::useCodeAlign() const { return false; }
diff --git a/llvm/lib/MC/MCSectionXCOFF.cpp b/llvm/lib/MC/MCSectionXCOFF.cpp
deleted file mode 100644
index 41043b2..0000000
--- a/llvm/lib/MC/MCSectionXCOFF.cpp
+++ /dev/null
@@ -1,134 +0,0 @@
-//===- lib/MC/MCSectionXCOFF.cpp - XCOFF Code Section Representation ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCSectionXCOFF.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-namespace llvm {
-class MCExpr;
-class Triple;
-} // namespace llvm
-
-using namespace llvm;
-
-MCSectionXCOFF::~MCSectionXCOFF() = default;
-
-void MCSectionXCOFF::printCsectDirective(raw_ostream &OS) const {
- OS << "\t.csect " << QualName->getName() << "," << Log2(getAlign()) << '\n';
-}
-
-void MCSectionXCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const {
- if (getKind().isText()) {
- if (getMappingClass() != XCOFF::XMC_PR)
- report_fatal_error("Unhandled storage-mapping class for .text csect");
-
- printCsectDirective(OS);
- return;
- }
-
- if (getKind().isReadOnly()) {
- if (getMappingClass() != XCOFF::XMC_RO &&
- getMappingClass() != XCOFF::XMC_TD)
- report_fatal_error("Unhandled storage-mapping class for .rodata csect.");
- printCsectDirective(OS);
- return;
- }
-
- if (getKind().isReadOnlyWithRel()) {
- if (getMappingClass() != XCOFF::XMC_RW &&
- getMappingClass() != XCOFF::XMC_RO &&
- getMappingClass() != XCOFF::XMC_TD)
- report_fatal_error(
- "Unexepected storage-mapping class for ReadOnlyWithRel kind");
- printCsectDirective(OS);
- return;
- }
-
- // Initialized TLS data.
- if (getKind().isThreadData()) {
- // We only expect XMC_TL here for initialized TLS data.
- if (getMappingClass() != XCOFF::XMC_TL)
- report_fatal_error("Unhandled storage-mapping class for .tdata csect.");
- printCsectDirective(OS);
- return;
- }
-
- if (getKind().isData()) {
- switch (getMappingClass()) {
- case XCOFF::XMC_RW:
- case XCOFF::XMC_DS:
- case XCOFF::XMC_TD:
- printCsectDirective(OS);
- break;
- case XCOFF::XMC_TC:
- case XCOFF::XMC_TE:
- break;
- case XCOFF::XMC_TC0:
- OS << "\t.toc\n";
- break;
- default:
- report_fatal_error(
- "Unhandled storage-mapping class for .data csect.");
- }
- return;
- }
-
- if (isCsect() && getMappingClass() == XCOFF::XMC_TD) {
- // Common csect type (uninitialized storage) does not have to print csect
- // directive for section switching unless it is local.
- if (getKind().isCommon() && !getKind().isBSSLocal())
- return;
-
- assert(getKind().isBSS() && "Unexpected section kind for toc-data");
- printCsectDirective(OS);
- return;
- }
- // Common csect type (uninitialized storage) does not have to print csect
- // directive for section switching.
- if (isCsect() && getCSectType() == XCOFF::XTY_CM) {
- assert((getMappingClass() == XCOFF::XMC_RW ||
- getMappingClass() == XCOFF::XMC_BS ||
- getMappingClass() == XCOFF::XMC_UL) &&
- "Generated a storage-mapping class for a common/bss/tbss csect we "
- "don't "
- "understand how to switch to.");
- // Common symbols and local zero-initialized symbols for TLS and Non-TLS are
- // eligible for .bss/.tbss csect, getKind().isThreadBSS() is used to cover
- // TLS common and zero-initialized local symbols since linkage type (in the
- // GlobalVariable) is not accessible in this class.
- assert((getKind().isBSSLocal() || getKind().isCommon() ||
- getKind().isThreadBSS()) &&
- "wrong symbol type for .bss/.tbss csect");
- // Don't have to print a directive for switching to section for commons and
- // zero-initialized TLS data. The '.comm' and '.lcomm' directives of the
- // variable will create the needed csect.
- return;
- }
-
- // Zero-initialized TLS data with weak or external linkage are not eligible to
- // be put into common csect.
- if (getKind().isThreadBSS()) {
- printCsectDirective(OS);
- return;
- }
-
- // XCOFF debug sections.
- if (getKind().isMetadata() && isDwarfSect()) {
- OS << "\n\t.dwsect " << format("0x%" PRIx32, *getDwarfSubtypeFlags())
- << '\n';
- OS << getName() << ':' << '\n';
- return;
- }
-
- report_fatal_error("Printing for this SectionKind is unimplemented.");
-}
-
-bool MCSectionXCOFF::useCodeAlign() const { return getKind().isText(); }
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index 30198c9..bc73981 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -56,12 +56,11 @@ void MCTargetStreamer::finish() {}
void MCTargetStreamer::emitConstantPools() {}
-void MCTargetStreamer::changeSection(const MCSection *CurSection,
- MCSection *Section, uint32_t Subsection,
- raw_ostream &OS) {
- Section->printSwitchToSection(*Streamer.getContext().getAsmInfo(),
- Streamer.getContext().getTargetTriple(), OS,
- Subsection);
+void MCTargetStreamer::changeSection(const MCSection *, MCSection *Sec,
+ uint32_t Subsection, raw_ostream &OS) {
+ auto &MAI = *Streamer.getContext().getAsmInfo();
+ MAI.printSwitchToSection(*Sec, Subsection,
+ Streamer.getContext().getTargetTriple(), OS);
}
void MCTargetStreamer::emitDwarfFileDirective(StringRef Directive) {
@@ -415,7 +414,7 @@ void MCStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
void MCStreamer::emitConditionalAssignment(MCSymbol *Symbol,
const MCExpr *Value) {}
-void MCStreamer::emitCFISections(bool EH, bool Debug) {}
+void MCStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) {}
void MCStreamer::emitCFIStartProc(bool IsSimple, SMLoc Loc) {
if (!FrameInfoStack.empty() &&
@@ -838,8 +837,8 @@ static MCSection *getWinCFISection(MCContext &Context, unsigned *NextWinCFIID,
if (TextSec == Context.getObjectFileInfo()->getTextSection())
return MainCFISec;
- const auto *TextSecCOFF = cast<MCSectionCOFF>(TextSec);
- auto *MainCFISecCOFF = cast<MCSectionCOFF>(MainCFISec);
+ const auto *TextSecCOFF = static_cast<const MCSectionCOFF *>(TextSec);
+ auto *MainCFISecCOFF = static_cast<MCSectionCOFF *>(MainCFISec);
unsigned UniqueID = TextSecCOFF->getOrAssignWinCFISectionID(NextWinCFIID);
// If this section is COMDAT, this unwind section should be COMDAT associative
@@ -1314,9 +1313,20 @@ void MCStreamer::emitZerofill(MCSection *, MCSymbol *, uint64_t, Align, SMLoc) {
}
void MCStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, Align ByteAlignment) {}
-void MCStreamer::changeSection(MCSection *Section, uint32_t) {
- CurFrag = &Section->getDummyFragment();
+
+void MCStreamer::changeSection(MCSection *Sec, uint32_t) {
+ CurFrag = &Sec->getDummyFragment();
+ auto *Sym = Sec->getBeginSymbol();
+ if (!Sym || !Sym->isUndefined())
+ return;
+ // In Mach-O, DWARF sections use Begin as a temporary label, requiring a label
+ // definition, unlike section symbols in other file formats.
+ if (getContext().getObjectFileType() == MCContext::IsMachO)
+ emitLabel(Sym);
+ else
+ Sym->setFragment(CurFrag);
}
+
void MCStreamer::emitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {}
void MCStreamer::emitBytes(StringRef Data) {}
void MCStreamer::emitBinaryData(StringRef Data) { emitBytes(Data); }
@@ -1358,9 +1368,6 @@ void MCStreamer::switchSection(MCSection *Section, uint32_t Subsection) {
changeSection(Section, Subsection);
SectionStack.back().first = MCSectionSubPair(Section, Subsection);
assert(!Section->hasEnded() && "Section already ended");
- MCSymbol *Sym = Section->getBeginSymbol();
- if (Sym && !Sym->isInSection())
- emitLabel(Sym);
}
}
@@ -1387,9 +1394,6 @@ void MCStreamer::switchSectionNoPrint(MCSection *Section) {
SectionStack.back().second = SectionStack.back().first;
SectionStack.back().first = MCSectionSubPair(Section, 0);
changeSection(Section, 0);
- MCSymbol *Sym = Section->getBeginSymbol();
- if (Sym && !Sym->isInSection())
- emitLabel(Sym);
}
MCSymbol *MCStreamer::endSection(MCSection *Section) {
diff --git a/llvm/lib/MC/MCTargetOptions.cpp b/llvm/lib/MC/MCTargetOptions.cpp
index bff4b8d..be6d19d 100644
--- a/llvm/lib/MC/MCTargetOptions.cpp
+++ b/llvm/lib/MC/MCTargetOptions.cpp
@@ -19,7 +19,8 @@ MCTargetOptions::MCTargetOptions()
PreserveAsmComments(true), Dwarf64(false),
EmitDwarfUnwind(EmitDwarfUnwindType::Default),
MCUseDwarfDirectory(DefaultDwarfDirectory),
- EmitCompactUnwindNonCanonical(false), PPCUseFullRegisterNames(false) {}
+ EmitCompactUnwindNonCanonical(false), EmitSFrameUnwind(false),
+ PPCUseFullRegisterNames(false) {}
StringRef MCTargetOptions::getABIName() const {
return ABIName;
diff --git a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp
index 2adc291..ff95ff7 100644
--- a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp
+++ b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp
@@ -41,6 +41,7 @@ MCOPT(int, DwarfVersion)
MCOPT(bool, Dwarf64)
MCOPT(EmitDwarfUnwindType, EmitDwarfUnwind)
MCOPT(bool, EmitCompactUnwindNonCanonical)
+MCOPT(bool, EmitSFrameUnwind)
MCOPT(bool, ShowMCInst)
MCOPT(bool, FatalWarnings)
MCOPT(bool, NoWarn)
@@ -105,6 +106,11 @@ llvm::mc::RegisterMCTargetOptionsFlags::RegisterMCTargetOptionsFlags() {
false)); // By default, use DWARF for non-canonical personalities.
MCBINDOPT(EmitCompactUnwindNonCanonical);
+ static cl::opt<bool> EmitSFrameUnwind(
+ "gsframe", cl::desc("Whether to emit .sframe unwind sections."),
+ cl::init(false));
+ MCBINDOPT(EmitSFrameUnwind);
+
static cl::opt<bool> ShowMCInst(
"asm-show-inst",
cl::desc("Emit internal instruction representation to assembly file"));
@@ -188,6 +194,7 @@ MCTargetOptions llvm::mc::InitMCTargetOptionsFromFlags() {
Options.X86Sse2Avx = getX86Sse2Avx();
Options.EmitDwarfUnwind = getEmitDwarfUnwind();
Options.EmitCompactUnwindNonCanonical = getEmitCompactUnwindNonCanonical();
+ Options.EmitSFrameUnwind = getEmitSFrameUnwind();
Options.AsSecureLogFile = getAsSecureLogFile();
return Options;
diff --git a/llvm/lib/MC/MCWasmStreamer.cpp b/llvm/lib/MC/MCWasmStreamer.cpp
index 5891420c..e3ef111 100644
--- a/llvm/lib/MC/MCWasmStreamer.cpp
+++ b/llvm/lib/MC/MCWasmStreamer.cpp
@@ -58,7 +58,7 @@ void MCWasmStreamer::emitLabelAtPos(MCSymbol *S, SMLoc Loc, MCFragment &F,
void MCWasmStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
MCAssembler &Asm = getAssembler();
- auto *SectionWasm = cast<MCSectionWasm>(Section);
+ auto *SectionWasm = static_cast<const MCSectionWasm *>(Section);
const MCSymbol *Grp = SectionWasm->getGroup();
if (Grp)
Asm.registerSymbol(*Grp);
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp
index 9369bea..1ffe25c 100644
--- a/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -157,7 +157,8 @@ void MCWinCOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
// Ensure that the first and the second symbols relative to the section are
// the section symbol and the COMDAT symbol.
getAssembler().registerSymbol(*Section->getBeginSymbol());
- if (auto *Sym = cast<MCSectionCOFF>(Section)->getCOMDATSymbol())
+ if (auto *Sym =
+ static_cast<const MCSectionCOFF *>(Section)->getCOMDATSymbol())
getAssembler().registerSymbol(*Sym);
}
diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp
index 63381b4..898ac5d 100644
--- a/llvm/lib/MC/MCXCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCXCOFFStreamer.cpp
@@ -36,6 +36,20 @@ XCOFFObjectWriter &MCXCOFFStreamer::getWriter() {
return static_cast<XCOFFObjectWriter &>(getAssembler().getWriter());
}
+void MCXCOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
+ MCObjectStreamer::changeSection(Section, Subsection);
+ auto *Sec = static_cast<const MCSectionXCOFF *>(Section);
+ // We might miss calculating the symbols difference as absolute value before
+ // adding fixups when symbol_A without the fragment set is the csect itself
+ // and symbol_B is in it.
+ // TODO: Currently we only set the fragment for XMC_PR csects and DWARF
+ // sections because we don't have other cases that hit this problem yet.
+ // if (IsDwarfSec || CsectProp->MappingClass == XCOFF::XMC_PR)
+ // QualName->setFragment(F);
+ if (Sec->isDwarfSect() || Sec->getMappingClass() == XCOFF::XMC_PR)
+ Sec->getQualNameSymbol()->setFragment(CurFrag);
+}
+
bool MCXCOFFStreamer::emitSymbolAttribute(MCSymbol *Sym,
MCSymbolAttr Attribute) {
auto *Symbol = cast<MCSymbolXCOFF>(Sym);
diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index 48d2fc6..7b5c3c0 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -126,7 +126,8 @@ uint64_t MachObjectWriter::getSymbolAddress(const MCSymbol &S) const {
uint64_t MachObjectWriter::getPaddingSize(const MCAssembler &Asm,
const MCSection *Sec) const {
uint64_t EndAddr = getSectionAddress(Sec) + Asm.getSectionAddressSize(*Sec);
- unsigned Next = cast<MCSectionMachO>(Sec)->getLayoutOrder() + 1;
+ unsigned Next =
+ static_cast<const MCSectionMachO *>(Sec)->getLayoutOrder() + 1;
if (Next >= SectionOrder.size())
return 0;
@@ -259,15 +260,12 @@ void MachObjectWriter::writeSegmentLoadCommand(
}
void MachObjectWriter::writeSection(const MCAssembler &Asm,
- const MCSection &Sec, uint64_t VMAddr,
+ const MCSectionMachO &Sec, uint64_t VMAddr,
uint64_t FileOffset, unsigned Flags,
uint64_t RelocationsStart,
unsigned NumRelocations) {
- uint64_t SectionSize = Asm.getSectionAddressSize(Sec);
- const MCSectionMachO &Section = cast<MCSectionMachO>(Sec);
-
// The offset is unused for virtual sections.
- if (Section.isBssSection()) {
+ if (Sec.isBssSection()) {
assert(Asm.getSectionFileSize(Sec) == 0 && "Invalid file size!");
FileOffset = 0;
}
@@ -275,11 +273,11 @@ void MachObjectWriter::writeSection(const MCAssembler &Asm,
// struct section (68 bytes) or
// struct section_64 (80 bytes)
+ uint64_t SectionSize = Asm.getSectionAddressSize(Sec);
uint64_t Start = W.OS.tell();
(void) Start;
-
- writeWithPadding(Section.getName(), 16);
- writeWithPadding(Section.getSegmentName(), 16);
+ writeWithPadding(Sec.getName(), 16);
+ writeWithPadding(Sec.getSegmentName(), 16);
if (is64Bit()) {
W.write<uint64_t>(VMAddr); // address
W.write<uint64_t>(SectionSize); // size
@@ -290,14 +288,14 @@ void MachObjectWriter::writeSection(const MCAssembler &Asm,
assert(isUInt<32>(FileOffset) && "Cannot encode offset of section");
W.write<uint32_t>(FileOffset);
- W.write<uint32_t>(Log2(Section.getAlign()));
+ W.write<uint32_t>(Log2(Sec.getAlign()));
assert((!NumRelocations || isUInt<32>(RelocationsStart)) &&
"Cannot encode offset of relocations");
W.write<uint32_t>(NumRelocations ? RelocationsStart : 0);
W.write<uint32_t>(NumRelocations);
W.write<uint32_t>(Flags);
W.write<uint32_t>(IndirectSymBase.lookup(&Sec)); // reserved1
- W.write<uint32_t>(Section.getStubSize()); // reserved2
+ W.write<uint32_t>(Sec.getStubSize()); // reserved2
if (is64Bit())
W.write<uint32_t>(0); // reserved3
@@ -531,7 +529,7 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) {
// Report errors for use of .indirect_symbol not in a symbol pointer section
// or stub section.
for (IndirectSymbolData &ISD : IndirectSymbols) {
- const MCSectionMachO &Section = cast<MCSectionMachO>(*ISD.Section);
+ const MCSectionMachO &Section = static_cast<MCSectionMachO &>(*ISD.Section);
if (Section.getType() != MachO::S_NON_LAZY_SYMBOL_POINTERS &&
Section.getType() != MachO::S_LAZY_SYMBOL_POINTERS &&
@@ -545,7 +543,7 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) {
// Bind non-lazy symbol pointers first.
for (auto [IndirectIndex, ISD] : enumerate(IndirectSymbols)) {
- const auto &Section = cast<MCSectionMachO>(*ISD.Section);
+ const auto &Section = static_cast<MCSectionMachO &>(*ISD.Section);
if (Section.getType() != MachO::S_NON_LAZY_SYMBOL_POINTERS &&
Section.getType() != MachO::S_THREAD_LOCAL_VARIABLE_POINTERS)
@@ -559,7 +557,7 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) {
// Then lazy symbol pointers and symbol stubs.
for (auto [IndirectIndex, ISD] : enumerate(IndirectSymbols)) {
- const auto &Section = cast<MCSectionMachO>(*ISD.Section);
+ const auto &Section = static_cast<MCSectionMachO &>(*ISD.Section);
if (Section.getType() != MachO::S_LAZY_SYMBOL_POINTERS &&
Section.getType() != MachO::S_SYMBOL_STUBS)
@@ -684,13 +682,13 @@ void MachObjectWriter::computeSectionAddresses(const MCAssembler &Asm) {
for (MCSection &Sec : Asm) {
if (!Sec.isBssSection()) {
SectionOrder.push_back(&Sec);
- cast<MCSectionMachO>(Sec).setLayoutOrder(i++);
+ static_cast<MCSectionMachO &>(Sec).setLayoutOrder(i++);
}
}
for (MCSection &Sec : Asm) {
if (Sec.isBssSection()) {
SectionOrder.push_back(&Sec);
- cast<MCSectionMachO>(Sec).setLayoutOrder(i++);
+ static_cast<MCSectionMachO &>(Sec).setLayoutOrder(i++);
}
}
@@ -907,7 +905,7 @@ uint64_t MachObjectWriter::writeObject() {
// ... and then the section headers.
uint64_t RelocTableEnd = SectionDataStart + SectionDataFileSize;
for (const MCSection &Section : Asm) {
- const auto &Sec = cast<MCSectionMachO>(Section);
+ const auto &Sec = static_cast<const MCSectionMachO &>(Section);
std::vector<RelAndSymbol> &Relocs = Relocations[&Sec];
unsigned NumRelocs = Relocs.size();
uint64_t SectionStart = SectionDataStart + getSectionAddress(&Sec);
diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp
index 3b99af4..bfd6334 100644
--- a/llvm/lib/MC/WasmObjectWriter.cpp
+++ b/llvm/lib/MC/WasmObjectWriter.cpp
@@ -480,7 +480,7 @@ void WasmObjectWriter::recordRelocation(const MCFragment &F,
// The WebAssembly backend should never generate FKF_IsPCRel fixups
assert(!Fixup.isPCRel());
- const auto &FixupSection = cast<MCSectionWasm>(*F.getParent());
+ const auto &FixupSection = static_cast<MCSectionWasm &>(*F.getParent());
uint64_t C = Target.getConstant();
uint64_t FixupOffset = Asm->getFragmentOffset(F) + Fixup.getOffset();
MCContext &Ctx = getContext();
diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp
index 6ad4334..856850d 100644
--- a/llvm/lib/MC/WinCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp
@@ -373,7 +373,7 @@ void WinCOFFWriter::defineSymbol(const MCSymbol &MCSym) {
COFFSection *Sec = nullptr;
MCSectionCOFF *MCSec = nullptr;
if (Base && Base->getFragment()) {
- MCSec = cast<MCSectionCOFF>(Base->getFragment()->getParent());
+ MCSec = static_cast<MCSectionCOFF *>(Base->getFragment()->getParent());
Sec = SectionMap[MCSec];
}
@@ -1057,7 +1057,8 @@ uint64_t WinCOFFWriter::writeObject() {
continue;
}
- const auto *AssocMCSec = cast<MCSectionCOFF>(&AssocMCSym->getSection());
+ const auto *AssocMCSec =
+ static_cast<const MCSectionCOFF *>(&AssocMCSym->getSection());
assert(SectionMap.count(AssocMCSec));
COFFSection *AssocSec = SectionMap[AssocMCSec];
diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp
index 2f6785f..65f543b 100644
--- a/llvm/lib/MC/XCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/XCOFFObjectWriter.cpp
@@ -550,13 +550,13 @@ CsectGroup &XCOFFWriter::getCsectGroup(const MCSectionXCOFF *MCSec) {
static MCSectionXCOFF *getContainingCsect(const MCSymbolXCOFF *XSym) {
if (XSym->isDefined())
- return cast<MCSectionXCOFF>(XSym->getFragment()->getParent());
+ return static_cast<MCSectionXCOFF *>(XSym->getFragment()->getParent());
return XSym->getRepresentedCsect();
}
void XCOFFWriter::executePostLayoutBinding() {
for (const auto &S : *Asm) {
- const auto *MCSec = cast<const MCSectionXCOFF>(&S);
+ auto *MCSec = static_cast<const MCSectionXCOFF *>(&S);
assert(!SectionMap.contains(MCSec) && "Cannot add a section twice.");
// If the name does not fit in the storage provided in the symbol table
@@ -747,7 +747,7 @@ void XCOFFWriter::recordRelocation(const MCFragment &F, const MCFixup &Fixup,
FixedValue = TOCEntryOffset;
}
} else if (Type == XCOFF::RelocationType::R_RBR) {
- MCSectionXCOFF *ParentSec = cast<MCSectionXCOFF>(F.getParent());
+ auto *ParentSec = static_cast<MCSectionXCOFF *>(F.getParent());
assert((SymASec->getMappingClass() == XCOFF::XMC_PR &&
ParentSec->getMappingClass() == XCOFF::XMC_PR) &&
"Only XMC_PR csect may have the R_RBR relocation.");
@@ -768,7 +768,7 @@ void XCOFFWriter::recordRelocation(const MCFragment &F, const MCFixup &Fixup,
}
XCOFFRelocation Reloc = {Index, FixupOffsetInCsect, SignAndSize, Type};
- MCSectionXCOFF *RelocationSec = cast<MCSectionXCOFF>(F.getParent());
+ auto *RelocationSec = static_cast<MCSectionXCOFF *>(F.getParent());
assert(SectionMap.contains(RelocationSec) &&
"Expected containing csect to exist in map.");
SectionMap[RelocationSec]->Relocations.push_back(Reloc);
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index cff7ab5..f810368 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -364,6 +364,7 @@
#include "llvm/Transforms/Utils/MoveAutoInit.h"
#include "llvm/Transforms/Utils/NameAnonGlobals.h"
#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/Transforms/Utils/ProfileVerify.h"
#include "llvm/Transforms/Utils/RelLookupTableConverter.h"
#include "llvm/Transforms/Utils/StripGCRelocates.h"
#include "llvm/Transforms/Utils/StripNonLineTableDebugInfo.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index caa78b6..fd89583 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -119,7 +119,6 @@ MODULE_PASS("module-inline", ModuleInlinerPass())
MODULE_PASS("name-anon-globals", NameAnonGlobalPass())
MODULE_PASS("no-op-module", NoOpModulePass())
MODULE_PASS("nsan", NumericalStabilitySanitizerPass())
-MODULE_PASS("objc-arc-apelim", ObjCARCAPElimPass())
MODULE_PASS("openmp-opt", OpenMPOptPass())
MODULE_PASS("openmp-opt-postlink",
OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink))
@@ -520,6 +519,8 @@ FUNCTION_PASS("print<regions>", RegionInfoPrinterPass(errs()))
FUNCTION_PASS("print<scalar-evolution>", ScalarEvolutionPrinterPass(errs()))
FUNCTION_PASS("print<stack-safety-local>", StackSafetyPrinterPass(errs()))
FUNCTION_PASS("print<uniformity>", UniformityInfoPrinterPass(errs()))
+FUNCTION_PASS("prof-inject", ProfileInjectorPass())
+FUNCTION_PASS("prof-verify", ProfileVerifierPass())
FUNCTION_PASS("reassociate", ReassociatePass())
FUNCTION_PASS("redundant-dbg-inst-elim", RedundantDbgInstEliminationPass())
FUNCTION_PASS("replace-with-veclib", ReplaceWithVeclib())
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 5c7b9e0..886add7 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -1295,7 +1295,7 @@ Error IndexedInstrProfReader::readHeader() {
// Writer first writes the length of compressed string, and then the actual
// content.
const char *VTableNamePtr = (const char *)Ptr;
- if (VTableNamePtr > (const char *)DataBuffer->getBufferEnd())
+ if (VTableNamePtr > DataBuffer->getBufferEnd())
return make_error<InstrProfError>(instrprof_error::truncated);
VTableName = StringRef(VTableNamePtr, CompressedVTableNamesLen);
diff --git a/llvm/lib/Support/AArch64AttributeParser.cpp b/llvm/lib/Support/AArch64AttributeParser.cpp
index c675ef2..eed8dba 100644
--- a/llvm/lib/Support/AArch64AttributeParser.cpp
+++ b/llvm/lib/Support/AArch64AttributeParser.cpp
@@ -8,6 +8,7 @@
//===---------------------------------------------------------------------===//
#include "llvm/Support/AArch64AttributeParser.h"
+#include "llvm/Support/AArch64BuildAttributes.h"
std::vector<llvm::SubsectionAndTagToTagName> &
llvm::AArch64AttributeParser::returnTagsNamesMap() {
@@ -19,3 +20,29 @@ llvm::AArch64AttributeParser::returnTagsNamesMap() {
{"aeabi_feature_and_bits", 2, "Tag_Feature_GCS"}};
return TagsNamesMap;
}
+
+llvm::AArch64BuildAttrSubsections llvm::extractBuildAttributesSubsections(
+ const llvm::AArch64AttributeParser &Attributes) {
+
+ llvm::AArch64BuildAttrSubsections SubSections;
+ auto GetPauthValue = [&Attributes](unsigned Tag) {
+ return Attributes.getAttributeValue("aeabi_pauthabi", Tag).value_or(0);
+ };
+ SubSections.Pauth.TagPlatform =
+ GetPauthValue(llvm::AArch64BuildAttributes::TAG_PAUTH_PLATFORM);
+ SubSections.Pauth.TagSchema =
+ GetPauthValue(llvm::AArch64BuildAttributes::TAG_PAUTH_SCHEMA);
+
+ auto GetFeatureValue = [&Attributes](unsigned Tag) {
+ return Attributes.getAttributeValue("aeabi_feature_and_bits", Tag)
+ .value_or(0);
+ };
+ SubSections.AndFeatures |=
+ GetFeatureValue(llvm::AArch64BuildAttributes::TAG_FEATURE_BTI);
+ SubSections.AndFeatures |=
+ GetFeatureValue(llvm::AArch64BuildAttributes::TAG_FEATURE_PAC) << 1;
+ SubSections.AndFeatures |=
+ GetFeatureValue(llvm::AArch64BuildAttributes::TAG_FEATURE_GCS) << 2;
+
+ return SubSections;
+}
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index d5c3cba..8491633 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -68,11 +68,19 @@ template class LLVM_EXPORT_TEMPLATE basic_parser<float>;
template class LLVM_EXPORT_TEMPLATE basic_parser<std::string>;
template class LLVM_EXPORT_TEMPLATE basic_parser<char>;
-template class opt<unsigned>;
-template class opt<int>;
-template class opt<std::string>;
-template class opt<char>;
-template class opt<bool>;
+#if !(defined(LLVM_ENABLE_LLVM_EXPORT_ANNOTATIONS) && defined(_MSC_VER))
+// Only instantiate opt<std::string> when not building a Windows DLL. When
+// exporting opt<std::string>, MSVC implicitly exports symbols for
+// std::basic_string through transitive inheritance via std::string. These
+// symbols may appear in clients, leading to duplicate symbol conflicts.
+template class LLVM_EXPORT_TEMPLATE opt<std::string>;
+#endif
+
+template class LLVM_EXPORT_TEMPLATE opt<bool>;
+template class LLVM_EXPORT_TEMPLATE opt<char>;
+template class LLVM_EXPORT_TEMPLATE opt<int>;
+template class LLVM_EXPORT_TEMPLATE opt<unsigned>;
+
} // namespace cl
} // namespace llvm
@@ -95,6 +103,15 @@ void parser<float>::anchor() {}
void parser<std::string>::anchor() {}
void parser<char>::anchor() {}
+// These anchor functions instantiate opt<T> and reference its virtual
+// destructor to ensure MSVC exports the corresponding vtable and typeinfo when
+// building a Windows DLL. Without an explicit reference, MSVC may omit the
+// instantiation at link time even if it is marked DLL-export.
+void opt_bool_anchor() { opt<bool> anchor{""}; }
+void opt_char_anchor() { opt<char> anchor{""}; }
+void opt_int_anchor() { opt<int> anchor{""}; }
+void opt_unsigned_anchor() { opt<unsigned> anchor{""}; }
+
//===----------------------------------------------------------------------===//
const static size_t DefaultPad = 2;
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index c4b43e1..c52487a 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -176,6 +176,9 @@ public:
std::optional<AArch64PACKey::ID> PACKey,
uint64_t PACDisc, Register PACAddrDisc);
+ // Emit the sequence for PAC.
+ void emitPtrauthSign(const MachineInstr *MI);
+
// Emit the sequence to compute the discriminator.
//
// The returned register is either unmodified AddrDisc or ScratchReg.
@@ -2175,6 +2178,37 @@ void AArch64AsmPrinter::emitPtrauthAuthResign(
OutStreamer->emitLabel(EndSym);
}
+void AArch64AsmPrinter::emitPtrauthSign(const MachineInstr *MI) {
+ Register Val = MI->getOperand(1).getReg();
+ auto Key = (AArch64PACKey::ID)MI->getOperand(2).getImm();
+ uint64_t Disc = MI->getOperand(3).getImm();
+ Register AddrDisc = MI->getOperand(4).getReg();
+ bool AddrDiscKilled = MI->getOperand(4).isKill();
+
+ // As long as at least one of Val and AddrDisc is in GPR64noip, a scratch
+ // register is available.
+ Register ScratchReg = Val == AArch64::X16 ? AArch64::X17 : AArch64::X16;
+ assert(ScratchReg != AddrDisc &&
+ "Neither X16 nor X17 is available as a scratch register");
+
+ // Compute pac discriminator
+ assert(isUInt<16>(Disc));
+ Register DiscReg = emitPtrauthDiscriminator(
+ Disc, AddrDisc, ScratchReg, /*MayUseAddrAsScratch=*/AddrDiscKilled);
+ bool IsZeroDisc = DiscReg == AArch64::XZR;
+ unsigned Opc = getPACOpcodeForKey(Key, IsZeroDisc);
+
+ // paciza x16 ; if IsZeroDisc
+ // pacia x16, x17 ; if !IsZeroDisc
+ MCInst PACInst;
+ PACInst.setOpcode(Opc);
+ PACInst.addOperand(MCOperand::createReg(Val));
+ PACInst.addOperand(MCOperand::createReg(Val));
+ if (!IsZeroDisc)
+ PACInst.addOperand(MCOperand::createReg(DiscReg));
+ EmitToStreamer(*OutStreamer, PACInst);
+}
+
void AArch64AsmPrinter::emitPtrauthBranch(const MachineInstr *MI) {
bool IsCall = MI->getOpcode() == AArch64::BLRA;
unsigned BrTarget = MI->getOperand(0).getReg();
@@ -2890,6 +2924,10 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
MI->getOperand(4).getImm(), MI->getOperand(5).getReg());
return;
+ case AArch64::PAC:
+ emitPtrauthSign(MI);
+ return;
+
case AArch64::LOADauthptrstatic:
LowerLOADauthptrstatic(*MI);
return;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 02ee517..7b49754 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -164,6 +164,9 @@ static cl::opt<bool> UseFEATCPACodegen(
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
+/// Value type used for NZCV flags.
+static constexpr MVT FlagsVT = MVT::i32;
+
static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
AArch64::X3, AArch64::X4, AArch64::X5,
AArch64::X6, AArch64::X7};
@@ -3098,6 +3101,83 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
return BB;
}
+// Helper function to find the instruction that defined a virtual register.
+// If unable to find such instruction, returns nullptr.
+static const MachineInstr *stripVRegCopies(const MachineRegisterInfo &MRI,
+ Register Reg) {
+ while (Reg.isVirtual()) {
+ MachineInstr *DefMI = MRI.getVRegDef(Reg);
+ assert(DefMI && "Virtual register definition not found");
+ unsigned Opcode = DefMI->getOpcode();
+
+ if (Opcode == AArch64::COPY) {
+ Reg = DefMI->getOperand(1).getReg();
+ // Vreg is defined by copying from physreg.
+ if (Reg.isPhysical())
+ return DefMI;
+ continue;
+ }
+ if (Opcode == AArch64::SUBREG_TO_REG) {
+ Reg = DefMI->getOperand(2).getReg();
+ continue;
+ }
+
+ return DefMI;
+ }
+ return nullptr;
+}
+
+void AArch64TargetLowering::fixupPtrauthDiscriminator(
+ MachineInstr &MI, MachineBasicBlock *BB, MachineOperand &IntDiscOp,
+ MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ Register AddrDisc = AddrDiscOp.getReg();
+ int64_t IntDisc = IntDiscOp.getImm();
+ assert(IntDisc == 0 && "Blend components are already expanded");
+
+ const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
+ if (DiscMI) {
+ switch (DiscMI->getOpcode()) {
+ case AArch64::MOVKXi:
+ // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
+ // #imm should be an immediate and not a global symbol, for example.
+ if (DiscMI->getOperand(2).isImm() &&
+ DiscMI->getOperand(3).getImm() == 48) {
+ AddrDisc = DiscMI->getOperand(1).getReg();
+ IntDisc = DiscMI->getOperand(2).getImm();
+ }
+ break;
+ case AArch64::MOVi32imm:
+ case AArch64::MOVi64imm:
+ // Small immediate integer constant passed via VReg.
+ if (DiscMI->getOperand(1).isImm() &&
+ isUInt<16>(DiscMI->getOperand(1).getImm())) {
+ AddrDisc = AArch64::NoRegister;
+ IntDisc = DiscMI->getOperand(1).getImm();
+ }
+ break;
+ }
+ }
+
+ // For uniformity, always use NoRegister, as XZR is not necessarily contained
+ // in the requested register class.
+ if (AddrDisc == AArch64::XZR)
+ AddrDisc = AArch64::NoRegister;
+
+ // Make sure AddrDisc operand respects the register class imposed by MI.
+ if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
+ Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
+ BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
+ AddrDisc = TmpReg;
+ }
+
+ AddrDiscOp.setReg(AddrDisc);
+ IntDiscOp.setImm(IntDisc);
+}
+
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
@@ -3196,6 +3276,11 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
case AArch64::MOVT_TIZ_PSEUDO:
return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
+
+ case AArch64::PAC:
+ fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
+ &AArch64::GPR64noipRegClass);
+ return BB;
}
}
@@ -3451,7 +3536,7 @@ static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL,
}
unsigned Opcode =
IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
- return DAG.getNode(Opcode, DL, {MVT::i32, MVT::Other}, {Chain, LHS, RHS});
+ return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
}
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
@@ -3465,7 +3550,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
}
- return DAG.getNode(AArch64ISD::FCMP, DL, MVT::i32, LHS, RHS);
+ return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
}
// The CMP instruction is just an alias for SUBS, and representing it as
@@ -3490,7 +3575,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// (a.k.a. ANDS) except that the flags are only guaranteed to work for one
// of the signed comparisons.
const SDValue ANDSNode =
- DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, MVT_CC),
+ DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
LHS.getOperand(0), LHS.getOperand(1));
// Replace all users of (and X, Y) with newly generated (ands X, Y)
DAG.ReplaceAllUsesWith(LHS, ANDSNode);
@@ -3501,7 +3586,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
}
}
- return DAG.getNode(Opcode, DL, DAG.getVTList(VT, MVT_CC), LHS, RHS)
+ return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
.getValue(1);
}
@@ -3597,7 +3682,7 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
- return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
+ return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
}
/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
@@ -4036,7 +4121,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
// Check that the result fits into a 32-bit integer.
- SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
+ SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
if (IsSigned) {
// cmp xreg, wreg, sxtw
SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
@@ -4059,12 +4144,12 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
DAG.getConstant(63, DL, MVT::i64));
// It is important that LowerBits is last, otherwise the arithmetic
// shift will not be folded into the compare (SUBS).
- SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
.getValue(1);
} else {
SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
- SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
Overflow =
DAG.getNode(AArch64ISD::SUBS, DL, VTs,
DAG.getConstant(0, DL, MVT::i64),
@@ -4075,7 +4160,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
} // switch (...)
if (Opc) {
- SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
// Emit the AArch64 operation with overflow check.
Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
@@ -4177,7 +4262,7 @@ static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
SDValue Cmp =
- DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
+ DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
return Cmp.getValue(1);
}
@@ -4220,16 +4305,15 @@ static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
SDLoc DL(Op);
- SDVTList VTs = DAG.getVTList(VT0, VT1);
- SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
+ SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
OpRHS, OpCarryIn);
SDValue OutFlag =
IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
: carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
- return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
+ return DAG.getMergeValues({Sum, OutFlag}, DL);
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
@@ -4254,8 +4338,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
Overflow =
DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Value, Overflow);
+ return DAG.getMergeValues({Value, Overflow}, DL);
}
// Prefetch operands are:
@@ -6813,7 +6896,8 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
SDValue Result = DAG.getMemIntrinsicNode(
AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
- {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
+ {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
+ DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
StoreNode->getMemoryVT(), StoreNode->getMemOperand());
return Result;
}
@@ -7037,9 +7121,8 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
Op.getOperand(0));
// Generate SUBS & CSEL.
- SDValue Cmp =
- DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
- Op.getOperand(0), DAG.getConstant(0, DL, VT));
+ SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
+ Op.getOperand(0), DAG.getConstant(0, DL, VT));
return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
Cmp.getValue(1));
@@ -11108,7 +11191,7 @@ SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
SDValue Carry = Op.getOperand(2);
// SBCS uses a carry not a borrow so the carry flag should be inverted first.
SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
- SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
+ SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
LHS, RHS, InvCarry);
EVT OpVT = Op.getValueType();
@@ -12441,10 +12524,10 @@ SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
// Get NZCV register. Only update chain when copyfrom is glued.
if (Glue.getNode()) {
- Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
+ Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
Chain = Glue.getValue(1);
} else
- Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
+ Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
// Extract CC code.
SDValue CC = getSETCC(Cond, Glue, DL, DAG);
@@ -18020,11 +18103,14 @@ bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
unsigned ShlAmt = C2->getZExtValue();
if (auto ShouldADD = *N->user_begin();
ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
- if (auto ShouldLOAD = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
- unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8;
- if ((1ULL << ShlAmt) == ByteVT &&
- isIndexedLoadLegal(ISD::PRE_INC, ShouldLOAD->getMemoryVT()))
- return false;
+ if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
+ EVT MemVT = Load->getMemoryVT();
+
+ if (Load->getValueType(0).isScalableVector())
+ return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
+
+ if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
+ return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
}
}
}
@@ -18593,7 +18679,7 @@ AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
Created.push_back(And.getNode());
} else {
SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
- SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ SDVTList VTs = DAG.getVTList(VT, FlagsVT);
SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
@@ -19482,10 +19568,10 @@ static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
// can select to CCMN to avoid the extra mov
SDValue AbsOp1 =
DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
- CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
- NZCVOp, Condition, Cmp0);
+ CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0),
+ AbsOp1, NZCVOp, Condition, Cmp0);
} else {
- CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
+ CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0),
Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
}
return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
@@ -25134,8 +25220,9 @@ static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
if (!TReassocOp && !FReassocOp)
return SDValue();
- SDValue NewCmp = DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
- DAG.getVTList(VT, MVT_CC), CmpOpOther, SubsOp);
+ SDValue NewCmp =
+ DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
+ DAG.getVTList(VT, FlagsVT), CmpOpOther, SubsOp);
auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
if (!ReassocOp)
@@ -27161,7 +27248,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
: AArch64SysReg::RNDRRS);
SDLoc DL(N);
SDValue A = DAG.getNode(
- AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::i32, MVT::Other),
+ AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
SDValue B = DAG.getNode(
AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
@@ -27907,16 +27994,16 @@ void AArch64TargetLowering::ReplaceNodeResults(
MemVT.getScalarSizeInBits() == 32u ||
MemVT.getScalarSizeInBits() == 64u)) {
+ EVT HalfVT = MemVT.getHalfNumVectorElementsVT(*DAG.getContext());
SDValue Result = DAG.getMemIntrinsicNode(
AArch64ISD::LDNP, SDLoc(N),
- DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
- MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
- MVT::Other}),
+ DAG.getVTList({MVT::v2i64, MVT::v2i64, MVT::Other}),
{LoadNode->getChain(), LoadNode->getBasePtr()},
LoadNode->getMemoryVT(), LoadNode->getMemOperand());
SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
- Result.getValue(0), Result.getValue(1));
+ DAG.getBitcast(HalfVT, Result.getValue(0)),
+ DAG.getBitcast(HalfVT, Result.getValue(1)));
Results.append({Pair, Result.getValue(2) /* Chain */});
return;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index d8403c2..95d0e3b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -182,6 +182,13 @@ public:
MachineBasicBlock *EmitGetSMESaveSize(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ /// Replace (0, vreg) discriminator components with the operands of blend
+ /// or with (immediate, NoRegister) when possible.
+ void fixupPtrauthDiscriminator(MachineInstr &MI, MachineBasicBlock *BB,
+ MachineOperand &IntDiscOp,
+ MachineOperand &AddrDiscOp,
+ const TargetRegisterClass *AddrDiscRC) const;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index bc57537..8685d7a0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,7 +20,6 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CFIInstBuilder.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -36,7 +35,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/StackMaps.h"
-#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -533,8 +531,9 @@ bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
MBP.LHS = LastInst->getOperand(0);
MBP.RHS = MachineOperand::CreateImm(0);
- MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
- : MachineBranchPredicate::PRED_EQ;
+ MBP.Predicate = (LastOpc == AArch64::CBNZX || LastOpc == AArch64::CBNZW)
+ ? MachineBranchPredicate::PRED_NE
+ : MachineBranchPredicate::PRED_EQ;
return false;
}
@@ -7353,9 +7352,6 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
- case AArch64MachineCombinerPattern::GATHER_LANE_i32:
- case AArch64MachineCombinerPattern::GATHER_LANE_i16:
- case AArch64MachineCombinerPattern::GATHER_LANE_i8:
return true;
} // end switch (Pattern)
return false;
@@ -7396,252 +7392,11 @@ static bool getMiscPatterns(MachineInstr &Root,
return false;
}
-static bool getGatherPattern(MachineInstr &Root,
- SmallVectorImpl<unsigned> &Patterns,
- unsigned LoadLaneOpCode, unsigned NumLanes) {
- const MachineFunction *MF = Root.getMF();
-
- // Early exit if optimizing for size.
- if (MF->getFunction().hasMinSize())
- return false;
-
- const MachineRegisterInfo &MRI = MF->getRegInfo();
- const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
-
- // The root of the pattern must load into the last lane of the vector.
- if (Root.getOperand(2).getImm() != NumLanes - 1)
- return false;
-
- // Check that we have load into all lanes except lane 0.
- // For each load we also want to check that:
- // 1. It has a single non-debug use (since we will be replacing the virtual
- // register)
- // 2. That the addressing mode only uses a single offset register.
- auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
- auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
- SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end());
- while (!RemainingLanes.empty() && CurrInstr &&
- CurrInstr->getOpcode() == LoadLaneOpCode &&
- MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
- CurrInstr->getNumOperands() == 4) {
- RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
- CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
- }
-
- if (!RemainingLanes.empty())
- return false;
-
- // Match the SUBREG_TO_REG sequence.
- if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
- return false;
-
- // Verify that the subreg to reg loads an integer into the first lane.
- auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
- unsigned SingleLaneSizeInBits = 128 / NumLanes;
- if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
- return false;
-
- // Verify that it also has a single non debug use.
- if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
- return false;
-
- switch (NumLanes) {
- case 4:
- Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
- break;
- case 8:
- Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
- break;
- case 16:
- Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
- break;
- default:
- llvm_unreachable("Got bad number of lanes for gather pattern.");
- }
-
- return true;
-}
-
-/// Search for patterns where we use LD1 instructions to load into
-/// separate lanes of an 128 bit Neon register. We can increase Memory Level
-/// Parallelism by loading into 2 Neon registers instead.
-static bool getLoadPatterns(MachineInstr &Root,
- SmallVectorImpl<unsigned> &Patterns) {
-
- // The pattern searches for loads into single lanes.
- switch (Root.getOpcode()) {
- case AArch64::LD1i32:
- return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
- case AArch64::LD1i16:
- return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
- case AArch64::LD1i8:
- return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
- default:
- return false;
- }
-}
-
-static void
-generateGatherPattern(MachineInstr &Root,
- SmallVectorImpl<MachineInstr *> &InsInstrs,
- SmallVectorImpl<MachineInstr *> &DelInstrs,
- DenseMap<Register, unsigned> &InstrIdxForVirtReg,
- unsigned Pattern, unsigned NumLanes) {
-
- MachineFunction &MF = *Root.getParent()->getParent();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
-
- // Gather the initial load instructions to build the pattern
- SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
- MachineInstr *CurrInstr = &Root;
- for (unsigned i = 0; i < NumLanes - 1; ++i) {
- LoadToLaneInstrs.push_back(CurrInstr);
- CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
- }
-
- // Sort the load instructions according to the lane.
- llvm::sort(LoadToLaneInstrs,
- [](const MachineInstr *A, const MachineInstr *B) {
- return A->getOperand(2).getImm() > B->getOperand(2).getImm();
- });
-
- MachineInstr *SubregToReg = CurrInstr;
- LoadToLaneInstrs.push_back(
- MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
- auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
-
- const TargetRegisterClass *FPR128RegClass =
- MRI.getRegClass(Root.getOperand(0).getReg());
-
- auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
- Register SrcRegister, unsigned Lane,
- Register OffsetRegister) {
- auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
- MachineInstrBuilder LoadIndexIntoRegister =
- BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
- NewRegister)
- .addReg(SrcRegister)
- .addImm(Lane)
- .addReg(OffsetRegister, getKillRegState(true));
- InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
- InsInstrs.push_back(LoadIndexIntoRegister);
- return NewRegister;
- };
-
- // Helper to create load instruction based on opcode
- auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg,
- Register OffsetReg) -> MachineInstrBuilder {
- unsigned Opcode;
- switch (NumLanes) {
- case 4:
- Opcode = AArch64::LDRSui;
- break;
- case 8:
- Opcode = AArch64::LDRHui;
- break;
- case 16:
- Opcode = AArch64::LDRBui;
- break;
- default:
- llvm_unreachable(
- "Got unsupported number of lanes in machine-combiner gather pattern");
- }
- // Immediate offset load
- return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
- .addReg(OffsetReg)
- .addImm(0); // immediate offset
- };
-
- // Load the remaining lanes into register 0.
- auto LanesToLoadToReg0 =
- llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
- LoadToLaneInstrsAscending.begin() + NumLanes / 2);
- auto PrevReg = SubregToReg->getOperand(0).getReg();
- for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
- PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
- LoadInstr->getOperand(3).getReg());
- DelInstrs.push_back(LoadInstr);
- }
- auto LastLoadReg0 = PrevReg;
-
- // First load into register 1. Perform a LDRSui to zero out the upper lanes in
- // a single instruction.
- auto Lane0Load = *LoadToLaneInstrsAscending.begin();
- auto OriginalSplitLoad =
- *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
- auto DestRegForMiddleIndex = MRI.createVirtualRegister(
- MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
-
- MachineInstrBuilder MiddleIndexLoadInstr =
- CreateLoadInstruction(NumLanes, DestRegForMiddleIndex,
- OriginalSplitLoad->getOperand(3).getReg());
-
- InstrIdxForVirtReg.insert(
- std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
- InsInstrs.push_back(MiddleIndexLoadInstr);
- DelInstrs.push_back(OriginalSplitLoad);
-
- // Subreg To Reg instruction for register 1.
- auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
- unsigned SubregType;
- switch (NumLanes) {
- case 4:
- SubregType = AArch64::ssub;
- break;
- case 8:
- SubregType = AArch64::hsub;
- break;
- case 16:
- SubregType = AArch64::bsub;
- break;
- default:
- llvm_unreachable(
- "Got invalid NumLanes for machine-combiner gather pattern");
- }
-
- auto SubRegToRegInstr =
- BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
- DestRegForSubregToReg)
- .addImm(0)
- .addReg(DestRegForMiddleIndex, getKillRegState(true))
- .addImm(SubregType);
- InstrIdxForVirtReg.insert(
- std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
- InsInstrs.push_back(SubRegToRegInstr);
-
- // Load remaining lanes into register 1.
- auto LanesToLoadToReg1 =
- llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
- LoadToLaneInstrsAscending.end());
- PrevReg = SubRegToRegInstr->getOperand(0).getReg();
- for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
- PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
- LoadInstr->getOperand(3).getReg());
- if (Index == NumLanes / 2 - 2) {
- break;
- }
- DelInstrs.push_back(LoadInstr);
- }
- auto LastLoadReg1 = PrevReg;
-
- // Create the final zip instruction to combine the results.
- MachineInstrBuilder ZipInstr =
- BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
- Root.getOperand(0).getReg())
- .addReg(LastLoadReg0)
- .addReg(LastLoadReg1);
- InsInstrs.push_back(ZipInstr);
-}
-
CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
switch (Pattern) {
case AArch64MachineCombinerPattern::SUBADD_OP1:
case AArch64MachineCombinerPattern::SUBADD_OP2:
- case AArch64MachineCombinerPattern::GATHER_LANE_i32:
- case AArch64MachineCombinerPattern::GATHER_LANE_i16:
- case AArch64MachineCombinerPattern::GATHER_LANE_i8:
return CombinerObjective::MustReduceDepth;
default:
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7671,10 +7426,6 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
if (getMiscPatterns(Root, Patterns))
return true;
- // Load patterns
- if (getLoadPatterns(Root, Patterns))
- return true;
-
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
DoRegPressureReduce);
}
@@ -8930,21 +8681,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
break;
}
- case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
- generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
- Pattern, 4);
- break;
- }
- case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
- generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
- Pattern, 8);
- break;
- }
- case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
- generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
- Pattern, 16);
- break;
- }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 02734866..7c255da 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,10 +172,6 @@ enum AArch64MachineCombinerPattern : unsigned {
FMULv8i16_indexed_OP2,
FNMADD,
-
- GATHER_LANE_i32,
- GATHER_LANE_i16,
- GATHER_LANE_i8
};
class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 9f8a257..07cacfa 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -430,26 +430,27 @@ def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">;
def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2,
[SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
- SDTCisInt<0>, SDTCisVT<1, i32>]>;
+ SDTCisInt<0>,
+ SDTCisVT<1, FlagsVT>]>;
// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS
def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisInt<0>,
- SDTCisVT<3, i32>]>;
+ SDTCisVT<3, FlagsVT>]>;
// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS
def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
[SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
SDTCisInt<0>,
- SDTCisVT<1, i32>,
- SDTCisVT<4, i32>]>;
+ SDTCisVT<1, FlagsVT>,
+ SDTCisVT<4, FlagsVT>]>;
def SDT_AArch64Brcond : SDTypeProfile<0, 3,
[SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>,
- SDTCisVT<2, i32>]>;
+ SDTCisVT<2, FlagsVT>]>;
def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
SDTCisVT<2, OtherVT>]>;
@@ -458,22 +459,22 @@ def SDT_AArch64CSel : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisInt<3>,
- SDTCisVT<4, i32>]>;
+ SDTCisVT<4, FlagsVT>]>;
def SDT_AArch64CCMP : SDTypeProfile<1, 5,
- [SDTCisVT<0, i32>,
+ [SDTCisVT<0, FlagsVT>,
SDTCisInt<1>,
SDTCisSameAs<1, 2>,
SDTCisInt<3>,
SDTCisInt<4>,
SDTCisVT<5, i32>]>;
def SDT_AArch64FCCMP : SDTypeProfile<1, 5,
- [SDTCisVT<0, i32>,
+ [SDTCisVT<0, FlagsVT>,
SDTCisFP<1>,
SDTCisSameAs<1, 2>,
SDTCisInt<3>,
SDTCisInt<4>,
SDTCisVT<5, i32>]>;
-def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, FlagsVT>,
SDTCisFP<1>,
SDTCisSameAs<2, 1>]>;
def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>;
@@ -518,10 +519,10 @@ def SDT_AArch64uaddlp : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def SDT_AArch64ldiapp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
-def SDT_AArch64ldnp : SDTypeProfile<2, 1, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64ldnp : SDTypeProfile<2, 1, [SDTCisVT<0, v2i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def SDT_AArch64stilp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
-def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
// Generates the general dynamic sequences, i.e.
// adrp x0, :tlsdesc:var
@@ -1124,10 +1125,10 @@ def AArch64probedalloca
SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPMayStore]>;
-// MRS, also sets the flags via a glue.
+// MRS, also sets the flags.
def AArch64mrs : SDNode<"AArch64ISD::MRS",
SDTypeProfile<2, 1, [SDTCisVT<0, i64>,
- SDTCisVT<1, i32>,
+ SDTCisVT<1, FlagsVT>,
SDTCisVT<2, i32>]>,
[SDNPHasChain]>;
@@ -2032,7 +2033,7 @@ let Predicates = [HasPAuth] in {
def DZB : SignAuthZero<prefix_z, 0b11, !strconcat(asm, "dzb"), op>;
}
- defm PAC : SignAuth<0b000, 0b010, "pac", int_ptrauth_sign>;
+ defm PAC : SignAuth<0b000, 0b010, "pac", null_frag>;
defm AUT : SignAuth<0b001, 0b011, "aut", null_frag>;
def XPACI : ClearAuth<0, "xpaci">;
@@ -2152,6 +2153,26 @@ let Predicates = [HasPAuth] in {
let Uses = [];
}
+ // PAC pseudo instruction. In AsmPrinter, it is expanded into an actual PAC*
+ // instruction immediately preceded by the discriminator computation.
+ // This enforces the expected immediate modifier is used for signing, even
+ // if an attacker is able to substitute AddrDisc.
+ def PAC : Pseudo<(outs GPR64:$SignedVal),
+ (ins GPR64:$Val, i32imm:$Key, i64imm:$Disc, GPR64noip:$AddrDisc),
+ [], "$SignedVal = $Val">, Sched<[WriteI, ReadI]> {
+ let isCodeGenOnly = 1;
+ let hasSideEffects = 0;
+ let mayStore = 0;
+ let mayLoad = 0;
+ let Size = 12;
+ let Defs = [X16, X17];
+ let usesCustomInserter = 1;
+ }
+
+ // A standalone pattern is used, so that literal 0 can be passed as $Disc.
+ def : Pat<(int_ptrauth_sign GPR64:$Val, timm:$Key, GPR64noip:$AddrDisc),
+ (PAC GPR64:$Val, $Key, 0, GPR64noip:$AddrDisc)>;
+
// AUT and re-PAC a value, using different keys/data.
// This directly manipulates x16/x17, which are the only registers that
// certain OSs guarantee are safe to use for sensitive operations.
@@ -3934,6 +3955,26 @@ defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
(SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
+// load zero-extended i32, bitcast to f64
+def : Pat <(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+
+// load zero-extended i16, bitcast to f64
+def : Pat <(f64 (bitconvert (i64 (zextloadi16 (am_indexed32 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+
+// load zero-extended i8, bitcast to f64
+def : Pat <(f64 (bitconvert (i64 (zextloadi8 (am_indexed32 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+
+// load zero-extended i16, bitcast to f32
+def : Pat <(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+
+// load zero-extended i8, bitcast to f32
+def : Pat <(f32 (bitconvert (i32 (zextloadi8 (am_indexed16 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+
// Pre-fetch.
def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
[(AArch64Prefetch timm:$Rt,
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index 61bf87f..1a7609b 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -305,7 +305,8 @@ def GPR64pi48 : RegisterOperand<GPR64, "printPostIncOperand<48>">;
def GPR64pi64 : RegisterOperand<GPR64, "printPostIncOperand<64>">;
// Condition code regclass.
-def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
+defvar FlagsVT = i32;
+def CCR : RegisterClass<"AArch64", [FlagsVT], 32, (add NZCV)> {
let CopyCost = -1; // Don't allow copying of status registers.
// CCR is not allocatable.
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index bafb8d0..8a5b5ba 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -32,10 +32,29 @@ AArch64SelectionDAGInfo::AArch64SelectionDAGInfo()
void AArch64SelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
const SDNode *N) const {
+ SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
+
#ifndef NDEBUG
+ // Some additional checks not yet implemented by verifyTargetNode.
+ constexpr MVT FlagsVT = MVT::i32;
switch (N->getOpcode()) {
- default:
- return SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
+ case AArch64ISD::SUBS:
+ assert(N->getValueType(1) == FlagsVT);
+ break;
+ case AArch64ISD::ADC:
+ case AArch64ISD::SBC:
+ assert(N->getOperand(2).getValueType() == FlagsVT);
+ break;
+ case AArch64ISD::ADCS:
+ case AArch64ISD::SBCS:
+ assert(N->getValueType(1) == FlagsVT);
+ assert(N->getOperand(2).getValueType() == FlagsVT);
+ break;
+ case AArch64ISD::CSEL:
+ case AArch64ISD::CSINC:
+ case AArch64ISD::BRCOND:
+ assert(N->getOperand(3).getValueType() == FlagsVT);
+ break;
case AArch64ISD::SADDWT:
case AArch64ISD::SADDWB:
case AArch64ISD::UADDWT:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index c218831..85de2d5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -36,7 +36,7 @@ void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx,
// SHF_AARCH64_PURECODE flag set if the "+execute-only" target feature is
// present.
if (TM.getMCSubtargetInfo()->hasFeature(AArch64::FeatureExecuteOnly)) {
- auto *Text = cast<MCSectionELF>(TextSection);
+ auto *Text = static_cast<MCSectionELF *>(TextSection);
Text->setFlags(Text->getFlags() | ELF::SHF_AARCH64_PURECODE);
}
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 08f547a..6257e99 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -523,7 +523,8 @@ void AArch64TargetELFStreamer::finish() {
// mark it execute-only if it is empty and there is at least one
// execute-only section in the object.
if (any_of(Asm, [](const MCSection &Sec) {
- return cast<MCSectionELF>(Sec).getFlags() & ELF::SHF_AARCH64_PURECODE;
+ return static_cast<const MCSectionELF &>(Sec).getFlags() &
+ ELF::SHF_AARCH64_PURECODE;
})) {
auto *Text =
static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection());
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 3d4a14b..1a9bce5 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -9,8 +9,6 @@
#include "AArch64MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 1ac340a..a22a17a 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -132,7 +132,8 @@ static bool canUseLocalRelocation(const MCSectionMachO &Section,
// But only if they don't point to a few forbidden sections.
if (!Symbol.isInSection())
return true;
- const MCSectionMachO &RefSec = cast<MCSectionMachO>(Symbol.getSection());
+ const MCSectionMachO &RefSec =
+ static_cast<MCSectionMachO &>(Symbol.getSection());
if (RefSec.getType() == MachO::S_CSTRING_LITERALS)
return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index e4e7bdc..8b8fc8b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -149,6 +149,12 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
"Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
>;
+def FeatureFmaMixBF16Insts : SubtargetFeature<"fma-mix-bf16-insts",
+ "HasFmaMixBF16Insts",
+ "true",
+ "Has v_fma_mix_f32_bf16, v_fma_mixlo_bf16, v_fma_mixhi_bf16 instructions"
+>;
+
def FeatureIEEEMinimumMaximumInsts : SubtargetFeature<"ieee-minimum-maximum-insts",
"HasIEEEMinimumMaximumInsts",
"true",
@@ -167,6 +173,12 @@ def FeatureMinimum3Maximum3F16 : SubtargetFeature<"minimum3-maximum3-f16",
"Has v_minimum3_f16 and v_maximum3_f16 instructions"
>;
+def FeatureMin3Max3PKF16 : SubtargetFeature<"min3-max3-pkf16",
+ "HasMin3Max3PKF16",
+ "true",
+ "Has v_pk_min3_num_f16 and v_pk_max3_num_f16 instructions"
+>;
+
def FeatureMinimum3Maximum3PKF16 : SubtargetFeature<"minimum3-maximum3-pkf16",
"HasMinimum3Maximum3PKF16",
"true",
@@ -256,12 +268,24 @@ def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug",
"S_INST_PREFETCH instruction causes shader to hang"
>;
+def FeatureVmemPrefInsts : SubtargetFeature<"vmem-pref-insts",
+ "HasVmemPrefInsts",
+ "true",
+ "Has flat_prefect_b8 and global_prefetch_b8 instructions"
+>;
+
def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch",
"HasSafeSmemPrefetch",
"true",
"SMEM prefetches do not fail on illegal address"
>;
+def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch",
+ "HasSafeCUPrefetch",
+ "true",
+ "VMEM CU scope prefetches do not fail on illegal address"
+>;
+
def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
"HasVcmpxExecWARHazard",
"true",
@@ -559,6 +583,12 @@ def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts",
"Has bf16 conversion instructions"
>;
+def FeatureBF16PackedInsts : SubtargetFeature<"bf16-pk-insts",
+ "HasBF16PackedInsts",
+ "true",
+ "Has bf16 packed instructions (fma, add, mul, max, min)"
+>;
+
def FeatureVOP3P : SubtargetFeature<"vop3p",
"HasVOP3PInsts",
"true",
@@ -1349,6 +1379,10 @@ def FeatureLshlAddU64Inst
: SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true",
"Has v_lshl_add_u64 instruction">;
+def FeatureAddSubU64Insts
+ : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true",
+ "Has v_add_u64 and v_sub_u64 instructions">;
+
def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
"HasVMemToLDSLoad",
"true",
@@ -1989,7 +2023,10 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureTransposeLoadF4F6Insts,
FeatureBF16TransInsts,
FeatureBF16ConversionInsts,
+ FeatureBF16PackedInsts,
FeatureCvtPkF16F32Inst,
+ FeatureFmaMixBF16Insts,
+ FeatureMin3Max3PKF16,
FeatureMinimum3Maximum3PKF16,
FeaturePrngInst,
FeaturePermlane16Swap,
@@ -2002,7 +2039,9 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureFlatBufferGlobalAtomicFaddF64Inst,
FeatureMemoryAtomicFAddF32DenormalSupport,
FeatureKernargPreload,
+ FeatureVmemPrefInsts,
FeatureLshlAddU64Inst,
+ FeatureAddSubU64Insts,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
]>;
@@ -2349,6 +2388,10 @@ def HasMinimum3Maximum3F16 :
Predicate<"Subtarget->hasMinimum3Maximum3F16()">,
AssemblerPredicate<(all_of FeatureMinimum3Maximum3F16)>;
+def HasMin3Max3PKF16 :
+ Predicate<"Subtarget->hasMin3Max3PKF16()">,
+ AssemblerPredicate<(all_of FeatureMin3Max3PKF16)>;
+
def HasMinimum3Maximum3PKF16 :
Predicate<"Subtarget->hasMinimum3Maximum3PKF16()">,
AssemblerPredicate<(all_of FeatureMinimum3Maximum3PKF16)>;
@@ -2472,6 +2515,9 @@ def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">,
AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>;
+def HasBF16PackedInsts : Predicate<"Subtarget->hasBF16PackedInsts()">,
+ AssemblerPredicate<(all_of FeatureBF16PackedInsts)>;
+
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
AssemblerPredicate<(all_of FeatureVOP3P)>;
@@ -2573,6 +2619,9 @@ def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
AssemblerPredicate<(all_of FeatureFmaMixInsts)>;
+def HasFmaMixBF16Insts : Predicate<"Subtarget->hasFmaMixBF16Insts()">,
+ AssemblerPredicate<(all_of FeatureFmaMixBF16Insts)>;
+
def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
AssemblerPredicate<(all_of FeatureDLInsts)>;
@@ -2771,12 +2820,18 @@ def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;
def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">,
AssemblerPredicate<(all_of FeatureXF32Insts)>;
+def HasVmemPrefInsts : Predicate<"Subtarget->hasVmemPrefInsts()">,
+ AssemblerPredicate<(all_of FeatureVmemPrefInsts)>;
+
def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
AssemblerPredicate<(all_of FeatureAshrPkInsts)>;
def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,
AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>;
+def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">,
+ AssemblerPredicate<(all_of FeatureAddSubU64Insts)>;
+
def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">,
AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index dedee46..49d8b44 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1383,7 +1383,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
&AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
&AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID,
- &AAIndirectCallInfo::ID, &AAInstanceInfo::ID});
+ &AAIndirectCallInfo::ID});
AttributorConfig AC(CGUpdater);
AC.IsClosedWorldModule = Options.IsClosedWorld;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 891d362..c01e5d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -137,6 +137,9 @@ def gi_global_offset :
def gi_global_saddr :
GIComplexOperandMatcher<s64, "selectGlobalSAddr">,
GIComplexPatternEquiv<GlobalSAddr>;
+def gi_global_saddr_cpol :
+ GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">,
+ GIComplexPatternEquiv<GlobalSAddrCPol>;
def gi_global_saddr_glc :
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
GIComplexPatternEquiv<GlobalSAddrGLC>;
@@ -446,5 +449,8 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">,
def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">,
GISDNodeXFormEquiv<as_hw_round_mode>;
+def gi_prefetch_loc : GICustomOperandRenderer<"renderPrefetchLoc">,
+ GISDNodeXFormEquiv<PrefetchLoc>;
+
def gi_MFMALdScaleModifierOp : GICustomOperandRenderer<"renderScaledMAIIntrinsicOperand">,
GISDNodeXFormEquiv<MFMALdScaleXForm>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index 00979f4..f36935d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -117,45 +117,72 @@ static LLT getReadAnyLaneSplitTy(LLT Ty) {
return LLT::scalar(32);
}
-static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
- const RegisterBankInfo &RBI);
-
-static void unmergeReadAnyLane(MachineIRBuilder &B,
- SmallVectorImpl<Register> &SgprDstParts,
- LLT UnmergeTy, Register VgprSrc,
- const RegisterBankInfo &RBI) {
+template <typename ReadLaneFnTy>
+static Register buildReadLane(MachineIRBuilder &, Register,
+ const RegisterBankInfo &, ReadLaneFnTy);
+
+template <typename ReadLaneFnTy>
+static void
+unmergeReadAnyLane(MachineIRBuilder &B, SmallVectorImpl<Register> &SgprDstParts,
+ LLT UnmergeTy, Register VgprSrc, const RegisterBankInfo &RBI,
+ ReadLaneFnTy BuildRL) {
const RegisterBank *VgprRB = &RBI.getRegBank(AMDGPU::VGPRRegBankID);
auto Unmerge = B.buildUnmerge({VgprRB, UnmergeTy}, VgprSrc);
for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
- SgprDstParts.push_back(buildReadAnyLane(B, Unmerge.getReg(i), RBI));
+ SgprDstParts.push_back(buildReadLane(B, Unmerge.getReg(i), RBI, BuildRL));
}
}
-static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
- const RegisterBankInfo &RBI) {
+template <typename ReadLaneFnTy>
+static Register buildReadLane(MachineIRBuilder &B, Register VgprSrc,
+ const RegisterBankInfo &RBI,
+ ReadLaneFnTy BuildRL) {
LLT Ty = B.getMRI()->getType(VgprSrc);
const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID);
if (Ty.getSizeInBits() == 32) {
- return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {{SgprRB, Ty}}, {VgprSrc})
- .getReg(0);
+ Register SgprDst = B.getMRI()->createVirtualRegister({SgprRB, Ty});
+ return BuildRL(B, SgprDst, VgprSrc).getReg(0);
}
SmallVector<Register, 8> SgprDstParts;
- unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
+ unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI,
+ BuildRL);
return B.buildMergeLikeInstr({SgprRB, Ty}, SgprDstParts).getReg(0);
}
-void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
- Register VgprSrc, const RegisterBankInfo &RBI) {
+template <typename ReadLaneFnTy>
+static void buildReadLane(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc, const RegisterBankInfo &RBI,
+ ReadLaneFnTy BuildReadLane) {
LLT Ty = B.getMRI()->getType(VgprSrc);
if (Ty.getSizeInBits() == 32) {
- B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
+ BuildReadLane(B, SgprDst, VgprSrc);
return;
}
SmallVector<Register, 8> SgprDstParts;
- unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
+ unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI,
+ BuildReadLane);
B.buildMergeLikeInstr(SgprDst, SgprDstParts).getReg(0);
}
+
+void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc, const RegisterBankInfo &RBI) {
+ return buildReadLane(
+ B, SgprDst, VgprSrc, RBI,
+ [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
+ return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
+ });
+}
+
+void AMDGPU::buildReadFirstLane(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc, const RegisterBankInfo &RBI) {
+ return buildReadLane(
+ B, SgprDst, VgprSrc, RBI,
+ [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
+ return B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, SgprDst)
+ .addReg(VgprSrc);
+ });
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 0c89bb5..5e1000e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -51,6 +51,8 @@ private:
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
const RegisterBankInfo &RBI);
+void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
+ const RegisterBankInfo &RBI);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 5a2416de..dfaa145 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2020,6 +2020,22 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
+ SDValue &SAddr, SDValue &VOffset,
+ SDValue &Offset,
+ SDValue &CPol) const {
+ bool ScaleOffset;
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
+ return false;
+
+ // We are assuming CPol is always the last operand of the intrinsic.
+ auto PassedCPol =
+ N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
+ CPol = CurDAG->getTargetConstant(
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
+ return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
SDValue &SAddr, SDValue &VOffset,
SDValue &Offset,
@@ -3861,58 +3877,114 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}
+// Match lowered fpext from bf16 to f32. This is a bit operation extending
+// a 16-bit value with 16-bit of zeroes at LSB:
+//
+// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
+// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
+// 3. (f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false
+static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
+ if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
+ return SDValue();
+ Op = Op.getOperand(0);
+
+ IsExtractHigh = false;
+ if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
+ auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+ if (!Low16 || !Low16->isZero())
+ return SDValue();
+ Op = stripBitcast(Op.getOperand(1));
+ if (Op.getValueType() != MVT::bf16)
+ return SDValue();
+ return Op;
+ }
+
+ if (Op.getValueType() != MVT::i32)
+ return SDValue();
+
+ if (Op.getOpcode() == ISD::AND) {
+ if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (Mask->getZExtValue() == 0xffff0000) {
+ IsExtractHigh = true;
+ return Op.getOperand(0);
+ }
+ }
+ return SDValue();
+ }
+
+ if (Op.getOpcode() == ISD::SHL) {
+ if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (Amt->getZExtValue() == 16)
+ return Op.getOperand(0);
+ }
+ }
+
+ return SDValue();
+}
+
// The return value is not whether the match is possible (which it always is),
// but whether or not it a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
- unsigned &Mods) const {
+ unsigned &Mods,
+ MVT VT) const {
Mods = 0;
SelectVOP3ModsImpl(In, Src, Mods);
+ bool IsExtractHigh = false;
if (Src.getOpcode() == ISD::FP_EXTEND) {
Src = Src.getOperand(0);
- assert(Src.getValueType() == MVT::f16);
- Src = stripBitcast(Src);
+ } else if (VT == MVT::bf16) {
+ SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
+ if (!B16)
+ return false;
+ Src = B16;
+ } else
+ return false;
- // Be careful about folding modifiers if we already have an abs. fneg is
- // applied last, so we don't want to apply an earlier fneg.
- if ((Mods & SISrcMods::ABS) == 0) {
- unsigned ModsTmp;
- SelectVOP3ModsImpl(Src, Src, ModsTmp);
+ if (Src.getValueType() != VT &&
+ (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
+ return false;
- if ((ModsTmp & SISrcMods::NEG) != 0)
- Mods ^= SISrcMods::NEG;
+ Src = stripBitcast(Src);
- if ((ModsTmp & SISrcMods::ABS) != 0)
- Mods |= SISrcMods::ABS;
- }
+ // Be careful about folding modifiers if we already have an abs. fneg is
+ // applied last, so we don't want to apply an earlier fneg.
+ if ((Mods & SISrcMods::ABS) == 0) {
+ unsigned ModsTmp;
+ SelectVOP3ModsImpl(Src, Src, ModsTmp);
- // op_sel/op_sel_hi decide the source type and source.
- // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
- // If the sources's op_sel is set, it picks the high half of the source
- // register.
+ if ((ModsTmp & SISrcMods::NEG) != 0)
+ Mods ^= SISrcMods::NEG;
- Mods |= SISrcMods::OP_SEL_1;
- if (isExtractHiElt(Src, Src)) {
- Mods |= SISrcMods::OP_SEL_0;
+ if ((ModsTmp & SISrcMods::ABS) != 0)
+ Mods |= SISrcMods::ABS;
+ }
- // TODO: Should we try to look for neg/abs here?
- }
+ // op_sel/op_sel_hi decide the source type and source.
+ // If the source's op_sel_hi is set, it indicates to do a conversion from
+ // fp16. If the sources's op_sel is set, it picks the high half of the source
+ // register.
- // Prevent unnecessary subreg COPY to VGPR_16
- if (Src.getOpcode() == ISD::TRUNCATE &&
- Src.getOperand(0).getValueType() == MVT::i32) {
- Src = Src.getOperand(0);
- }
- return true;
+ Mods |= SISrcMods::OP_SEL_1;
+ if (IsExtractHigh ||
+ (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) {
+ Mods |= SISrcMods::OP_SEL_0;
+
+ // TODO: Should we try to look for neg/abs here?
}
- return false;
+ // Prevent unnecessary subreg COPY to VGPR_16
+ if (Src.getOpcode() == ISD::TRUNCATE &&
+ Src.getOperand(0).getValueType() == MVT::i32) {
+ Src = Src.getOperand(0);
+ }
+ return true;
}
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
- if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
+ if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
return false;
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
@@ -3921,7 +3993,24 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
- SelectVOP3PMadMixModsImpl(In, Src, Mods);
+ SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
+ return false;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 6123d75..5636d89 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -19,6 +19,7 @@
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -167,6 +168,9 @@ private:
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
+ bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &Offset,
+ SDValue &CPol) const;
bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
@@ -254,11 +258,15 @@ private:
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
- unsigned &Mods) const;
+ bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods,
+ MVT VT) const;
bool SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
SDValue &SrcMods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
+ bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
SDValue &Tbl) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 877c3ac..266dee1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5774,6 +5774,16 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
+ const MachineInstr &I = *Root.getParent();
+
+ // We are assuming CPol is always the last operand of the intrinsic.
+ auto PassedCPol =
+ I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
+ return selectGlobalSAddr(Root, PassedCPol);
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
}
@@ -7068,6 +7078,17 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
}
+void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ uint32_t V = MI.getOperand(2).getImm();
+ V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK))
+ << AMDGPU::CPol::SCOPE_SHIFT;
+ if (!Subtarget->hasSafeCUPrefetch())
+ V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
+ MIB.addImm(V);
+}
+
/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 5f7f05c..fe9743d0a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -261,6 +261,8 @@ private:
InstructionSelector::ComplexRendererFns
selectGlobalSAddr(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectGlobalSAddrCPol(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
selectGlobalSAddrGLC(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
@@ -414,6 +416,10 @@ private:
void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+
+ void renderPrefetchLoc(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+
void renderScaledMAIIntrinsicOperand(MachineInstrBuilder &MIB,
const MachineInstr &MI, int OpIdx) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e7bf88d..fedfa3f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4208,6 +4208,9 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
assert(Ty.isScalar());
unsigned Size = Ty.getSizeInBits();
+ if (ST.hasVectorMulU64() && Size == 64)
+ return true;
+
unsigned NumParts = Size / 32;
assert((Size % 32) == 0);
assert(NumParts >= 2);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index ba66134..e187959 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -23,6 +23,8 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -115,126 +117,233 @@ public:
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {};
- bool isLaneMask(Register Reg) {
- const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
- if (RB && RB->getID() == AMDGPU::VCCRegBankID)
- return true;
+ bool isLaneMask(Register Reg);
+ std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode);
+ std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src);
+ Register getReadAnyLaneSrc(Register Src);
+ void replaceRegWithOrBuildCopy(Register Dst, Register Src);
- const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
- return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
- }
+ bool tryEliminateReadAnyLane(MachineInstr &Copy);
+ void tryCombineCopy(MachineInstr &MI);
+ void tryCombineS1AnyExt(MachineInstr &MI);
+};
- void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) {
- MI.eraseFromParent();
- if (Optional0 && isTriviallyDead(*Optional0, MRI))
- Optional0->eraseFromParent();
- }
+bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) {
+ const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
+ if (RB && RB->getID() == AMDGPU::VCCRegBankID)
+ return true;
- std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) {
- MachineInstr *MatchMI = MRI.getVRegDef(Src);
- if (MatchMI->getOpcode() != Opcode)
- return {nullptr, Register()};
- return {MatchMI, MatchMI->getOperand(1).getReg()};
- }
+ const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+ return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
+}
- void tryCombineCopy(MachineInstr &MI) {
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- // Skip copies of physical registers.
- if (!Dst.isVirtual() || !Src.isVirtual())
- return;
-
- // This is a cross bank copy, sgpr S1 to lane mask.
- //
- // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
- // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
- // ->
- // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
- if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
- auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
- assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
- "sgpr S1 must be result of G_TRUNC of sgpr S32");
-
- B.setInstr(MI);
- // Ensure that truncated bits in BoolSrc are 0.
- auto One = B.buildConstant({SgprRB, S32}, 1);
- auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
- B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+std::pair<MachineInstr *, Register>
+AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) {
+ MachineInstr *MatchMI = MRI.getVRegDef(Src);
+ if (MatchMI->getOpcode() != Opcode)
+ return {nullptr, Register()};
+ return {MatchMI, MatchMI->getOperand(1).getReg()};
+}
+
+std::pair<GUnmerge *, int>
+AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
+ MachineInstr *ReadAnyLane = MRI.getVRegDef(Src);
+ if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE)
+ return {nullptr, -1};
+
+ Register RALSrc = ReadAnyLane->getOperand(1).getReg();
+ if (auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI))
+ return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)};
- // Src = G_AMDGPU_READANYLANE RALSrc
- // Dst = COPY Src
- // ->
- // Dst = RALSrc
- if (MRI.getRegBankOrNull(Dst) == VgprRB &&
- MRI.getRegBankOrNull(Src) == SgprRB) {
- auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
- if (!RAL)
- return;
-
- assert(MRI.getRegBank(RALSrc) == VgprRB);
- MRI.replaceRegWith(Dst, RALSrc);
- cleanUpAfterCombine(MI, RAL);
- return;
+ return {nullptr, -1};
+}
+
+Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
+ // Src = G_AMDGPU_READANYLANE RALSrc
+ auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
+ if (RAL)
+ return RALSrc;
+
+ // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
+ // LoSgpr = G_AMDGPU_READANYLANE LoVgpr
+ // HiSgpr = G_AMDGPU_READANYLANE HiVgpr
+ // Src G_MERGE_VALUES LoSgpr, HiSgpr
+ auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI);
+ if (Merge) {
+ unsigned NumElts = Merge->getNumSources();
+ auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0));
+ if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0)
+ return {};
+
+ // Check if all elements are from same unmerge and there is no shuffling.
+ for (unsigned i = 1; i < NumElts; ++i) {
+ auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i));
+ if (UnmergeI != Unmerge || (unsigned)IdxI != i)
+ return {};
}
+ return Unmerge->getSourceReg();
}
- void tryCombineS1AnyExt(MachineInstr &MI) {
- // %Src:sgpr(S1) = G_TRUNC %TruncSrc
- // %Dst = G_ANYEXT %Src:sgpr(S1)
- // ->
- // %Dst = G_... %TruncSrc
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- if (MRI.getType(Src) != S1)
- return;
-
- auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
- if (!Trunc)
- return;
-
- LLT DstTy = MRI.getType(Dst);
- LLT TruncSrcTy = MRI.getType(TruncSrc);
-
- if (DstTy == TruncSrcTy) {
- MRI.replaceRegWith(Dst, TruncSrc);
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ // SrcRegIdx = G_AMDGPU_READANYLANE RALElSrc
+ // SourceReg G_MERGE_VALUES ..., SrcRegIdx, ...
+ // ..., Src, ... = G_UNMERGE_VALUES SourceReg
+ auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI);
+ if (!UnMerge)
+ return {};
+
+ int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr);
+ Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI);
+ if (!Merge || UnMerge->getNumDefs() != Merge->getNumSources())
+ return {};
+
+ Register SrcRegIdx = Merge->getSourceReg(Idx);
+ if (MRI.getType(Src) != MRI.getType(SrcRegIdx))
+ return {};
+
+ auto [RALEl, RALElSrc] = tryMatch(SrcRegIdx, AMDGPU::G_AMDGPU_READANYLANE);
+ if (RALEl)
+ return RALElSrc;
+
+ return {};
+}
+
+void AMDGPURegBankLegalizeCombiner::replaceRegWithOrBuildCopy(Register Dst,
+ Register Src) {
+ if (Dst.isVirtual())
+ MRI.replaceRegWith(Dst, Src);
+ else
+ B.buildCopy(Dst, Src);
+}
+
+bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane(
+ MachineInstr &Copy) {
+ Register Dst = Copy.getOperand(0).getReg();
+ Register Src = Copy.getOperand(1).getReg();
+
+ // Skip non-vgpr Dst
+ if (Dst.isVirtual() ? (MRI.getRegBankOrNull(Dst) != VgprRB)
+ : !TRI.isVGPR(MRI, Dst))
+ return false;
+
+ // Skip physical source registers and source registers with register class
+ if (!Src.isVirtual() || MRI.getRegClassOrNull(Src))
+ return false;
+
+ Register RALDst = Src;
+ MachineInstr &SrcMI = *MRI.getVRegDef(Src);
+ if (SrcMI.getOpcode() == AMDGPU::G_BITCAST)
+ RALDst = SrcMI.getOperand(1).getReg();
+
+ Register RALSrc = getReadAnyLaneSrc(RALDst);
+ if (!RALSrc)
+ return false;
+
+ B.setInstr(Copy);
+ if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) {
+ // Src = READANYLANE RALSrc Src = READANYLANE RALSrc
+ // Dst = Copy Src $Dst = Copy Src
+ // -> ->
+ // Dst = RALSrc $Dst = Copy RALSrc
+ replaceRegWithOrBuildCopy(Dst, RALSrc);
+ } else {
+ // RALDst = READANYLANE RALSrc RALDst = READANYLANE RALSrc
+ // Src = G_BITCAST RALDst Src = G_BITCAST RALDst
+ // Dst = Copy Src Dst = Copy Src
+ // -> ->
+ // NewVgpr = G_BITCAST RALDst NewVgpr = G_BITCAST RALDst
+ // Dst = NewVgpr $Dst = Copy NewVgpr
+ auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc);
+ replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0));
+ }
+
+ eraseInstr(Copy, MRI);
+ return true;
+}
+
+void AMDGPURegBankLegalizeCombiner::tryCombineCopy(MachineInstr &MI) {
+ if (tryEliminateReadAnyLane(MI))
+ return;
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ // Skip copies of physical registers.
+ if (!Dst.isVirtual() || !Src.isVirtual())
+ return;
+
+ // This is a cross bank copy, sgpr S1 to lane mask.
+ //
+ // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
+ // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
+ // ->
+ // %BoolSrc:sgpr(s32) = G_AND %TruncS32Src:sgpr(s32), 1
+ // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %BoolSrc:sgpr(s32)
+ if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
+ auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
+ assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
+ "sgpr S1 must be result of G_TRUNC of sgpr S32");
B.setInstr(MI);
+ // Ensure that truncated bits in BoolSrc are 0.
+ auto One = B.buildConstant({SgprRB, S32}, 1);
+ auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
+ B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
+ eraseInstr(MI, MRI);
+ }
+}
- if (DstTy == S32 && TruncSrcTy == S64) {
- auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
- MRI.replaceRegWith(Dst, Unmerge.getReg(0));
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+void AMDGPURegBankLegalizeCombiner::tryCombineS1AnyExt(MachineInstr &MI) {
+ // %Src:sgpr(S1) = G_TRUNC %TruncSrc
+ // %Dst = G_ANYEXT %Src:sgpr(S1)
+ // ->
+ // %Dst = G_... %TruncSrc
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ if (MRI.getType(Src) != S1)
+ return;
+
+ auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
+ if (!Trunc)
+ return;
+
+ LLT DstTy = MRI.getType(Dst);
+ LLT TruncSrcTy = MRI.getType(TruncSrc);
+
+ if (DstTy == TruncSrcTy) {
+ MRI.replaceRegWith(Dst, TruncSrc);
+ eraseInstr(MI, MRI);
+ return;
+ }
- if (DstTy == S64 && TruncSrcTy == S32) {
- B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
- {TruncSrc, B.buildUndef({SgprRB, S32})});
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ B.setInstr(MI);
- if (DstTy == S32 && TruncSrcTy == S16) {
- B.buildAnyExt(Dst, TruncSrc);
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ if (DstTy == S32 && TruncSrcTy == S64) {
+ auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
+ MRI.replaceRegWith(Dst, Unmerge.getReg(0));
+ eraseInstr(MI, MRI);
+ return;
+ }
- if (DstTy == S16 && TruncSrcTy == S32) {
- B.buildTrunc(Dst, TruncSrc);
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ if (DstTy == S64 && TruncSrcTy == S32) {
+ B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
+ {TruncSrc, B.buildUndef({SgprRB, S32})});
+ eraseInstr(MI, MRI);
+ return;
+ }
- llvm_unreachable("missing anyext + trunc combine");
+ if (DstTy == S32 && TruncSrcTy == S16) {
+ B.buildAnyExt(Dst, TruncSrc);
+ eraseInstr(MI, MRI);
+ return;
}
-};
+
+ if (DstTy == S16 && TruncSrcTy == S32) {
+ B.buildTrunc(Dst, TruncSrc);
+ eraseInstr(MI, MRI);
+ return;
+ }
+
+ llvm_unreachable("missing anyext + trunc combine");
+}
// Search through MRI for virtual registers with sgpr register bank and S1 LLT.
[[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 411159c..f471881 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -33,7 +33,7 @@ RegBankLegalizeHelper::RegBankLegalizeHelper(
MachineIRBuilder &B, const MachineUniformityInfo &MUI,
const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
: ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
- MUI(MUI), RBI(RBI), RBLRules(RBLRules),
+ MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()),
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
@@ -56,6 +56,224 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
lower(MI, Mapping, WaterfallSgprs);
}
+bool RegBankLegalizeHelper::executeInWaterfallLoop(
+ MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
+ SmallSet<Register, 4> &SGPROperandRegs) {
+ // Track use registers which have already been expanded with a readfirstlane
+ // sequence. This may have multiple uses if moving a sequence.
+ DenseMap<Register, Register> WaterfalledRegMap;
+
+ MachineBasicBlock &MBB = B.getMBB();
+ MachineFunction &MF = B.getMF();
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
+ unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
+ if (IsWave32) {
+ MovExecOpc = AMDGPU::S_MOV_B32;
+ MovExecTermOpc = AMDGPU::S_MOV_B32_term;
+ XorTermOpc = AMDGPU::S_XOR_B32_term;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
+ ExecReg = AMDGPU::EXEC_LO;
+ } else {
+ MovExecOpc = AMDGPU::S_MOV_B64;
+ MovExecTermOpc = AMDGPU::S_MOV_B64_term;
+ XorTermOpc = AMDGPU::S_XOR_B64_term;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
+ ExecReg = AMDGPU::EXEC;
+ }
+
+#ifndef NDEBUG
+ const int OrigRangeSize = std::distance(Range.begin(), Range.end());
+#endif
+
+ MachineRegisterInfo &MRI = *B.getMRI();
+ Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
+ Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
+
+ // Don't bother using generic instructions/registers for the exec mask.
+ B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
+
+ Register SavedExec = MRI.createVirtualRegister(WaveRC);
+
+ // To insert the loop we need to split the block. Move everything before
+ // this point to a new block, and insert a new empty block before this
+ // instruction.
+ MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+ MF.insert(MBBI, LoopBB);
+ MF.insert(MBBI, BodyBB);
+ MF.insert(MBBI, RestoreExecBB);
+ MF.insert(MBBI, RemainderBB);
+
+ LoopBB->addSuccessor(BodyBB);
+ BodyBB->addSuccessor(RestoreExecBB);
+ BodyBB->addSuccessor(LoopBB);
+
+ // Move the rest of the block into a new block.
+ RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
+
+ MBB.addSuccessor(LoopBB);
+ RestoreExecBB->addSuccessor(RemainderBB);
+
+ B.setInsertPt(*LoopBB, LoopBB->end());
+
+ // +-MBB:------------+
+ // | ... |
+ // | %0 = G_INST_1 |
+ // | %Dst = MI %Vgpr |
+ // | %1 = G_INST_2 |
+ // | ... |
+ // +-----------------+
+ // ->
+ // +-MBB-------------------------------+
+ // | ... |
+ // | %0 = G_INST_1 |
+ // | %SaveExecReg = S_MOV_B32 $exec_lo |
+ // +----------------|------------------+
+ // | /------------------------------|
+ // V V |
+ // +-LoopBB---------------------------------------------------------------+ |
+ // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
+ // | instead of executing for each lane, see if other lanes had | |
+ // | same value for %Vgpr and execute for them also. | |
+ // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
+ // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
+ // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
+ // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
+ // +----------------|-----------------------------------------------------+ |
+ // V |
+ // +-BodyBB------------------------------------------------------------+ |
+ // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
+ // | executed only for active lanes and written to Dst | |
+ // | $exec = S_XOR_B32 $exec, %SavedExec | |
+ // | set active lanes to 0 in SavedExec, lanes that did not write to | |
+ // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
+ // | SI_WATERFALL_LOOP LoopBB |-----|
+ // +----------------|--------------------------------------------------+
+ // V
+ // +-RestoreExecBB--------------------------+
+ // | $exec_lo = S_MOV_B32_term %SaveExecReg |
+ // +----------------|-----------------------+
+ // V
+ // +-RemainderBB:----------------------+
+ // | %1 = G_INST_2 |
+ // | ... |
+ // +---------------------------------- +
+
+ // Move the instruction into the loop body. Note we moved everything after
+ // Range.end() already into a new block, so Range.end() is no longer valid.
+ BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
+
+ // Figure out the iterator range after splicing the instructions.
+ MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator();
+ auto NewEnd = BodyBB->end();
+ assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
+
+ B.setMBB(*LoopBB);
+ Register CondReg;
+
+ for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
+ for (MachineOperand &Op : MI.all_uses()) {
+ Register OldReg = Op.getReg();
+ if (!SGPROperandRegs.count(OldReg))
+ continue;
+
+ // See if we already processed this register in another instruction in
+ // the sequence.
+ auto OldVal = WaterfalledRegMap.find(OldReg);
+ if (OldVal != WaterfalledRegMap.end()) {
+ Op.setReg(OldVal->second);
+ continue;
+ }
+
+ Register OpReg = Op.getReg();
+ LLT OpTy = MRI.getType(OpReg);
+
+ // TODO: support for agpr
+ assert(MRI.getRegBank(OpReg) == VgprRB);
+ Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
+ buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
+
+ // Build the comparison(s), CurrentLaneReg == OpReg.
+ unsigned OpSize = OpTy.getSizeInBits();
+ unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
+ LLT PartTy = LLT::scalar(PartSize);
+ unsigned NumParts = OpSize / PartSize;
+ SmallVector<Register, 8> OpParts;
+ SmallVector<Register, 8> CurrentLaneParts;
+
+ if (NumParts == 1) {
+ OpParts.push_back(OpReg);
+ CurrentLaneParts.push_back(CurrentLaneReg);
+ } else {
+ auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
+ auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
+ for (unsigned i = 0; i < NumParts; ++i) {
+ OpParts.push_back(UnmergeOp.getReg(i));
+ CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
+ }
+ }
+
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
+ B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
+
+ if (!CondReg)
+ CondReg = CmpReg;
+ else
+ CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
+ }
+
+ Op.setReg(CurrentLaneReg);
+
+ // Make sure we don't re-process this register again.
+ WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
+ }
+ }
+
+ // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
+ Register CondRegLM =
+ MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
+ B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
+
+ // Update EXEC, save the original EXEC value to SavedExec.
+ B.buildInstr(AndSaveExecOpc)
+ .addDef(SavedExec)
+ .addReg(CondRegLM, RegState::Kill);
+ MRI.setSimpleHint(SavedExec, CondRegLM);
+
+ B.setInsertPt(*BodyBB, BodyBB->end());
+
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+ B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
+
+ // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
+ // s_cbranch_scc0?
+
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
+ B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
+
+ // Save the EXEC mask before the loop.
+ B.setInsertPt(MBB, MBB.end());
+ B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
+
+ // Restore the EXEC mask after the loop.
+ B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
+ B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
+
+ // Set the insert point after the original instruction, so any new
+ // instructions will be in the remainder.
+ B.setInsertPt(*RemainderBB, RemainderBB->begin());
+
+ return true;
+}
+
void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
MachineFunction &MF = B.getMF();
@@ -391,7 +609,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
switch (Mapping.LoweringMethod) {
case DoNotLower:
- return;
+ break;
case VccExtToSel:
return lowerVccExtToSel(MI);
case UniExtToSel: {
@@ -527,7 +745,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
}
}
- // TODO: executeInWaterfallLoop(... WaterfallSgprs)
+ if (!WaterfallSgprs.empty()) {
+ MachineBasicBlock::iterator I = MI.getIterator();
+ executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs);
+ }
}
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
@@ -539,6 +760,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case Vgpr16:
return LLT::scalar(16);
case Sgpr32:
+ case Sgpr32_WF:
case Sgpr32Trunc:
case Sgpr32AExt:
case Sgpr32AExtBoolInReg:
@@ -577,6 +799,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case VgprV2S32:
return LLT::fixed_vector(2, 32);
case SgprV4S32:
+ case SgprV4S32_WF:
case VgprV4S32:
case UniInVgprV4S32:
return LLT::fixed_vector(4, 32);
@@ -650,6 +873,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
return VccRB;
case Sgpr16:
case Sgpr32:
+ case Sgpr32_WF:
case Sgpr64:
case Sgpr128:
case SgprP1:
@@ -662,6 +886,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case SgprV2S16:
case SgprV2S32:
case SgprV4S32:
+ case SgprV4S32_WF:
case SgprB32:
case SgprB64:
case SgprB96:
@@ -923,6 +1148,14 @@ void RegBankLegalizeHelper::applyMappingSrc(
}
break;
}
+ // sgpr waterfall, scalars and vectors
+ case Sgpr32_WF:
+ case SgprV4S32_WF: {
+ assert(Ty == getTyFromID(MethodIDs[i]));
+ if (RB != SgprRB)
+ SgprWaterfallOperandRegs.insert(Reg);
+ break;
+ }
// sgpr and vgpr scalars with extend
case Sgpr32AExt: {
// Note: this ext allows S1, and it is meant to be combined away.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index 08cc7d4..db965d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -32,6 +32,7 @@ class RegBankLegalizeHelper {
const MachineUniformityInfo &MUI;
const RegisterBankInfo &RBI;
const RegBankLegalizeRules &RBLRules;
+ const bool IsWave32;
const RegisterBank *SgprRB;
const RegisterBank *VgprRB;
const RegisterBank *VccRB;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index a60855c..5a6ad40 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -529,7 +529,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_ICMP})
.Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
- .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}});
+ .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
+ .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}});
addRulesForGOpcs({G_FCMP})
.Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}})
@@ -666,11 +667,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
// clang-format off
addRulesForGOpcs({G_LOAD})
.Any({{DivB32, DivP0}, {{VgprB32}, {VgprP0}}})
+ .Any({{DivB32, UniP0}, {{VgprB32}, {VgprP0}}})
.Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}})
.Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
.Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
.Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}})
+ .Any({{{UniB64, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}})
+ .Any({{{UniB96, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}})
+ .Any({{{UniB128, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}})
.Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}})
.Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}})
@@ -684,6 +689,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{{UniB96, UniP4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasUnalignedLoads)
.Any({{{UniB96, UniP4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasUnalignedLoads)
.Any({{{UniB96, UniP4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasUnalignedLoads)
+ .Any({{{UniB128, UniP4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}})
.Any({{{UniB256, UniP4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
.Any({{{UniB512, UniP4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
.Any({{{UniB32, UniP4}, !isNaturalAlignedSmall || !isUL}, {{UniInVgprB32}, {VgprP4}}}, hasSMRDSmall) // i8 and i16 load
@@ -698,11 +704,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}});
// clang-format on
- addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector)
- .Div(S32, {{Vgpr32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
- .Uni(S32, {{UniInVgprS32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
- .Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
- .Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}});
+ addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, StandardB)
+ .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(B96, {{VgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B96, {{UniInVgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
addRulesForGOpcs({G_STORE})
.Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}})
@@ -716,7 +726,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_PTR_ADD})
.Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
.Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
- .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}});
+ .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}})
+ .Any({{UniP4}, {{SgprP4}, {SgprP4, Sgpr64}}});
addRulesForGOpcs({G_INTTOPTR})
.Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}})
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 7243d75..1391440 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -188,7 +188,11 @@ enum RegBankLLTMappingApplyID {
Sgpr32Trunc,
- // Src only modifiers: waterfalls, extends
+ // Src only modifiers: execute in waterfall loop if divergent
+ Sgpr32_WF,
+ SgprV4S32_WF,
+
+ // Src only modifiers: extends
Sgpr32AExt,
Sgpr32AExtBoolInReg,
Sgpr32SExt,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index f1caf24..c5a1d9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2528,7 +2528,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// Special case for s_mul_u64. There is not a vector equivalent of
// s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
// multiplications.
- if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
+ if (!Subtarget.hasVectorMulU64() && Opc == AMDGPU::G_MUL &&
+ DstTy.getSizeInBits() == 64) {
applyMappingSMULU64(B, OpdMapper);
return;
}
@@ -3500,19 +3501,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyMappingMAD_64_32(B, OpdMapper);
return;
case AMDGPU::G_PREFETCH: {
- if (!Subtarget.hasPrefetch() || !Subtarget.hasSafeSmemPrefetch()) {
+ if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) {
MI.eraseFromParent();
return;
}
Register PtrReg = MI.getOperand(0).getReg();
unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
- if (PtrBank == AMDGPU::VGPRRegBankID) {
+ if (PtrBank == AMDGPU::VGPRRegBankID &&
+ (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(3).getImm())) {
+ // Cannot do I$ prefetch with divergent pointer.
MI.eraseFromParent();
return;
}
unsigned AS = MRI.getType(PtrReg).getAddressSpace();
- if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
- AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ if ((!AMDGPU::isFlatGlobalAddrSpace(AS) &&
+ AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
+ (!Subtarget.hasSafeSmemPrefetch() &&
+ (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+ !MI.getOperand(3).getImm() /* I$ prefetch */))) {
MI.eraseFromParent();
return;
}
@@ -3973,7 +3979,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
} else {
- OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
+ if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVectorMulU64())
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ else
+ OpdsMapping[0] =
+ getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
@@ -5170,6 +5180,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_ds_load_tr16_b128:
case Intrinsic::amdgcn_ds_load_tr4_b64:
case Intrinsic::amdgcn_ds_load_tr6_b96:
+ case Intrinsic::amdgcn_flat_load_monitor_b32:
+ case Intrinsic::amdgcn_flat_load_monitor_b64:
+ case Intrinsic::amdgcn_flat_load_monitor_b128:
+ case Intrinsic::amdgcn_global_load_monitor_b32:
+ case Intrinsic::amdgcn_global_load_monitor_b64:
+ case Intrinsic::amdgcn_global_load_monitor_b128:
case Intrinsic::amdgcn_ds_read_tr4_b64:
case Intrinsic::amdgcn_ds_read_tr6_b96:
case Intrinsic::amdgcn_ds_read_tr8_b64:
@@ -5432,6 +5448,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_flat_prefetch:
+ case Intrinsic::amdgcn_global_prefetch:
+ return getDefaultMappingVOP(MI);
default:
return getInvalidInstructionMapping();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index a8e1967..f580f43 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -159,7 +159,8 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
// If the inputs are tied and the same register, we can shortcut and
// directly replace the register.
- if (Src2->getReg() != CopySrcReg) {
+ if (!Src2->isReg() || Src2->getReg() != CopySrcReg ||
+ Src2->getSubReg() != DefMI->getOperand(1).getSubReg()) {
LLVM_DEBUG(
dbgs()
<< "Replacing untied VGPR MFMAs with AGPR form not yet handled\n");
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 1e44be8..6878744 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -61,6 +61,7 @@ protected:
bool EnableRealTrue16Insts = false;
bool HasBF16TransInsts = false;
bool HasBF16ConversionInsts = false;
+ bool HasBF16PackedInsts = false;
bool HasMadMixInsts = false;
bool HasMadMacF32Insts = false;
bool HasDsSrc2Insts = false;
@@ -209,6 +210,8 @@ public:
return HasBF16ConversionInsts;
}
+ bool hasBF16PackedInsts() const { return HasBF16PackedInsts; }
+
bool hasMadMixInsts() const {
return HasMadMixInsts;
}
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 679c55d..7207c25 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -13,6 +13,7 @@ let WantsRoot = true in {
def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
+ def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>;
def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>;
def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>;
}
@@ -464,6 +465,37 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n
let sve = 0;
}
+class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64:$vaddr), string asm = " $vaddr"> :
+ FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> {
+ let has_vdst = 0;
+ let has_data = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let VM_CNT = 0;
+ let LGKM_CNT = 0;
+}
+
+multiclass FLAT_Flat_Prefetch_Pseudo<string opName> {
+ def "" : FLAT_Prefetch_Pseudo<opName>,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">,
+ GlobalSaddrTable<1, opName> {
+ let OtherPredicates = [HasFlatGVSMode];
+ let enabled_saddr = 1;
+ }
+}
+
+multiclass FLAT_Global_Prefetch_Pseudo<string opName> {
+ let is_flat_global = 1, has_saddr = 1 in {
+ def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64:$vaddr), " $vaddr, off">,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">,
+ GlobalSaddrTable<1, opName> {
+ let enabled_saddr = 1;
+ }
+ }
+}
+
class FlatScratchInst <string sv_op, string mode> {
string SVOp = sv_op;
string Mode = mode;
@@ -1162,6 +1194,16 @@ defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_u
defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sshort">;
defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">;
+let SubtargetPredicate = isGFX125xOnly in {
+defm FLAT_LOAD_MONITOR_B32 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b32", VGPR_32>;
+defm FLAT_LOAD_MONITOR_B64 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b64", VReg_64>;
+defm FLAT_LOAD_MONITOR_B128 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b128", VReg_128>;
+
+defm GLOBAL_LOAD_MONITOR_B32 : FLAT_Global_Load_Pseudo <"global_load_monitor_b32", VGPR_32>;
+defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b64", VReg_64>;
+defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>;
+} // End SubtargetPredicate = isGFX125xOnly
+
let SubtargetPredicate = isGFX12Plus in {
let Uses = [EXEC, M0] in {
defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>;
@@ -1218,6 +1260,11 @@ let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in
"global_atomic_pk_add_f16", VGPR_32, v2f16
>;
+let SubtargetPredicate = HasVmemPrefInsts in {
+ defm FLAT_PREFETCH_B8 : FLAT_Flat_Prefetch_Pseudo<"flat_prefetch_b8">;
+ defm GLOBAL_PREFETCH_B8 : FLAT_Global_Prefetch_Pseudo<"global_prefetch_b8">;
+}
+
//===----------------------------------------------------------------------===//
// Flat Patterns
//===----------------------------------------------------------------------===//
@@ -1228,6 +1275,11 @@ class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCN
(inst $vaddr, $offset)
>;
+class FlatLoadPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (FlatOffset i64:$vaddr, i32:$offset), (i32 timm:$cpol))),
+ (inst $vaddr, $offset, $cpol)
+>;
+
class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
(inst $vaddr, $offset, 0, $in)
@@ -1249,8 +1301,8 @@ class FlatSignedLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Value
>;
class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)),
- (inst $saddr, $voffset, $offset, 0, $in)
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)),
+ (inst $saddr, $voffset, $offset, $cpol, $in)
>;
class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1264,8 +1316,8 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
>;
class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
- (inst $saddr, $voffset, $offset, (i32 0))
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (inst $saddr, $voffset, $offset, $cpol)
>;
class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1278,6 +1330,16 @@ class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
(inst $saddr, $voffset, $offset, $cpol)
>;
+class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))),
+ (inst $vaddr, $offset, $cpol)
+>;
+
+class GlobalLoadSaddrPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddrCPol (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm))),
+ (inst $saddr, $voffset, $offset, $cpol)
+>;
+
class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
(node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol)),
@@ -1459,8 +1521,8 @@ class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
>;
class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))),
- (inst $vaddr, $saddr, $offset, 0)
+ (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
+ (inst $vaddr, $saddr, $offset, $cpol)
>;
multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
@@ -1473,6 +1535,16 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
}
}
+multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadSignedPat_CPOL<inst, node, vt> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalLoadSaddrPat_CPOL<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 11;
+ }
+}
+
multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatSignedLoadPat_D16 <inst, node, vt> {
let AddedComplexity = 10;
@@ -2009,6 +2081,16 @@ let WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts]
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR6_B96, int_amdgcn_global_load_tr6_b96, v3i32>;
}
+let OtherPredicates = [isGFX125xOnly] in {
+ def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B32, int_amdgcn_flat_load_monitor_b32, i32>;
+ def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B64, int_amdgcn_flat_load_monitor_b64, v2i32>;
+ def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B128, int_amdgcn_flat_load_monitor_b128, v4i32>;
+
+ defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B32, int_amdgcn_global_load_monitor_b32, i32>;
+ defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B64, int_amdgcn_global_load_monitor_b64, v2i32>;
+ defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>;
+} // End SubtargetPredicate = isGFX125xOnly
+
let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
@@ -2138,6 +2220,77 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f
} // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch]
+def PrefetchLoc: SDNodeXForm<timm, [{
+ uint32_t V = N->getZExtValue();
+ V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK)) << AMDGPU::CPol::SCOPE_SHIFT;
+ if (!Subtarget->hasSafeCUPrefetch())
+ V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
+ return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
+}]>;
+
+def prefetch_flat : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
+ (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
+ [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; }]> {
+ let GISelPredicateCode = [{
+ return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
+ }];
+}
+
+def prefetch_global : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
+ (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
+ [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ (cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ !Subtarget->hasSafeSmemPrefetch()); }]> {
+ let GISelPredicateCode = [{
+ return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ ((*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ !Subtarget->hasSafeSmemPrefetch());
+ }];
+}
+
+multiclass FlatPrefetchPats<string inst, SDPatternOperator prefetch_kind, SDPatternOperator rw> {
+ def : GCNPat <
+ (prefetch_kind (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), rw, (i32 timm:$loc), i32imm_one),
+ (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, (i32 (PrefetchLoc $loc)))
+ > {
+ let AddedComplexity = !if(!eq(rw, i32imm_zero), 0, 25);
+ }
+
+ def : GCNPat <
+ (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one),
+ (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, (i32 (PrefetchLoc $loc)))
+ > {
+ let AddedComplexity = !if(!eq(rw, i32imm_zero), 11, 30);
+ }
+}
+
+multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> {
+ def : GCNPat <
+ (intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol),
+ (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, $cpol)
+ >;
+
+ def : GCNPat <
+ (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol),
+ (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> {
+ let AddedComplexity = 11;
+ }
+}
+
+let SubtargetPredicate = HasVmemPrefInsts in {
+ defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_zero>;
+ defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_zero>;
+
+ // Patterns for forced vector prefetch with rw = 1.
+ defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_one>;
+ defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_one>;
+
+
+ // Patterns for target intrinsics
+ defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>;
+ defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>;
+} // End SubtargetPredicate = HasVmemPrefInsts
+
//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
@@ -3210,6 +3363,17 @@ multiclass VFLAT_Real_Atomics_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME
defm TENSOR_SAVE : VFLAT_Real_gfx1250<0x06e>;
defm TENSOR_STOP : VFLAT_Real_gfx1250<0x06f>;
+defm FLAT_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>;
+defm GLOBAL_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>;
+
+defm FLAT_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
+defm FLAT_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
+defm FLAT_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
+
+defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
+defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
+defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
+
defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">;
defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 7d6723a..334afd3 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -38,7 +38,11 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1,
unsigned GCNRegPressure::getRegKind(const TargetRegisterClass *RC,
const SIRegisterInfo *STI) {
- return STI->isSGPRClass(RC) ? SGPR : (STI->isAGPRClass(RC) ? AGPR : VGPR);
+ return STI->isSGPRClass(RC)
+ ? SGPR
+ : (STI->isAGPRClass(RC)
+ ? AGPR
+ : (STI->isVectorSuperClass(RC) ? AVGPR : VGPR));
}
void GCNRegPressure::inc(unsigned Reg,
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 3749b6d..ea33a22 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -29,43 +29,57 @@ class raw_ostream;
class SlotIndex;
struct GCNRegPressure {
- enum RegKind { SGPR, VGPR, AGPR, TOTAL_KINDS };
+ enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS };
GCNRegPressure() {
clear();
}
- bool empty() const { return !Value[SGPR] && !Value[VGPR] && !Value[AGPR]; }
+ bool empty() const {
+ return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR];
+ }
void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
/// \returns the SGPR32 pressure
unsigned getSGPRNum() const { return Value[SGPR]; }
- /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p
- /// UnifiedVGPRFile
+ /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure
+ /// dependent upon \p UnifiedVGPRFile
unsigned getVGPRNum(bool UnifiedVGPRFile) const {
if (UnifiedVGPRFile) {
- return Value[AGPR] ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR])
- : Value[VGPR];
+ return Value[AGPR]
+ ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR])
+ : Value[VGPR] + Value[AVGPR];
}
- return std::max(Value[VGPR], Value[AGPR]);
+ // AVGPR assignment priority is based on the width of the register. Account
+ // AVGPR pressure as VGPR.
+ return std::max(Value[VGPR] + Value[AVGPR], Value[AGPR]);
}
/// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs
- /// and \p NumAGPRs AGPRS, for a target with a unified VGPR file.
+ /// \p NumAGPRs AGPRS, and \p NumAVGPRs AVGPRs for a target with a unified
+ /// VGPR file.
inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs,
- unsigned NumAGPRs) {
- return alignTo(NumArchVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
+ unsigned NumAGPRs,
+ unsigned NumAVGPRs) {
+
+ // Assume AVGPRs will be assigned as VGPRs.
+ return alignTo(NumArchVGPRs + NumAVGPRs,
+ AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
NumAGPRs;
}
- /// \returns the ArchVGPR32 pressure
- unsigned getArchVGPRNum() const { return Value[VGPR]; }
+ /// \returns the ArchVGPR32 pressure, plus the AVGPRS which we assume will be
+ /// allocated as VGPR
+ unsigned getArchVGPRNum() const { return Value[VGPR] + Value[AVGPR]; }
/// \returns the AccVGPR32 pressure
unsigned getAGPRNum() const { return Value[AGPR]; }
+ /// \returns the AVGPR32 pressure
+ unsigned getAVGPRNum() const { return Value[AVGPR]; }
unsigned getVGPRTuplesWeight() const {
- return std::max(Value[TOTAL_KINDS + VGPR], Value[TOTAL_KINDS + AGPR]);
+ return std::max(Value[TOTAL_KINDS + VGPR] + Value[TOTAL_KINDS + AVGPR],
+ Value[TOTAL_KINDS + AGPR]);
}
unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index a655308..ce1ce68 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1911,14 +1911,12 @@ void PreRARematStage::rematerialize() {
for (auto &[DefMI, Remat] : Rematerializations) {
MachineBasicBlock::iterator InsertPos(Remat.UseMI);
Register Reg = DefMI->getOperand(0).getReg();
- unsigned SubReg = DefMI->getOperand(0).getSubReg();
unsigned DefRegion = MIRegion.at(DefMI);
// Rematerialize DefMI to its use block.
- TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI,
- *DAG.TRI);
+ TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
+ AMDGPU::NoSubRegister, *DefMI, *DAG.TRI);
Remat.RematMI = &*std::prev(InsertPos);
- Remat.RematMI->getOperand(0).setSubReg(SubReg);
DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
// Update region boundaries in regions we sinked from (remove defining MI)
@@ -2064,14 +2062,13 @@ void PreRARematStage::finalizeGCNSchedStage() {
MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
MachineBasicBlock *MBB = RegionBB[DefRegion];
Register Reg = RematMI.getOperand(0).getReg();
- unsigned SubReg = RematMI.getOperand(0).getSubReg();
// Re-rematerialize MI at the end of its original region. Note that it may
// not be rematerialized exactly in the same position as originally within
// the region, but it should not matter much.
- TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI);
+ TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI,
+ *DAG.TRI);
MachineInstr *NewMI = &*std::prev(InsertPos);
- NewMI->getOperand(0).setSubReg(SubReg);
DAG.LIS->InsertMachineInstrInMaps(*NewMI);
auto UseRegion = MIRegion.find(Remat.UseMI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 8b758b0..88a269f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -123,6 +123,7 @@ protected:
bool HasSMemRealTime = false;
bool HasIntClamp = false;
bool HasFmaMixInsts = false;
+ bool HasFmaMixBF16Insts = false;
bool HasMovrel = false;
bool HasVGPRIndexMode = false;
bool HasScalarDwordx3Loads = false;
@@ -244,7 +245,9 @@ protected:
bool HasVMEMtoScalarWriteHazard = false;
bool HasSMEMtoVectorWriteHazard = false;
bool HasInstFwdPrefetchBug = false;
+ bool HasVmemPrefInsts = false;
bool HasSafeSmemPrefetch = false;
+ bool HasSafeCUPrefetch = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
@@ -265,8 +268,10 @@ protected:
bool HasIEEEMinimumMaximumInsts = false;
bool HasMinimum3Maximum3F32 = false;
bool HasMinimum3Maximum3F16 = false;
+ bool HasMin3Max3PKF16 = false;
bool HasMinimum3Maximum3PKF16 = false;
bool HasLshlAddU64Inst = false;
+ bool HasAddSubU64Insts = false;
bool HasPointSampleAccel = false;
bool HasLdsBarrierArriveAtomic = false;
bool HasSetPrioIncWgInst = false;
@@ -460,6 +465,8 @@ public:
return HasFmaMixInsts;
}
+ bool hasFmaMixBF16Insts() const { return HasFmaMixBF16Insts; }
+
bool hasCARRY() const {
return true;
}
@@ -985,8 +992,12 @@ public:
bool hasPrefetch() const { return GFX12Insts; }
+ bool hasVmemPrefInsts() const { return HasVmemPrefInsts; }
+
bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
+ bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
+
// Has s_cmpk_* instructions.
bool hasSCmpK() const { return getGeneration() < GFX12; }
@@ -1306,7 +1317,7 @@ public:
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
- bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
+ bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }
/// Return if operations acting on VGPR tuples require even alignment.
bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; }
@@ -1387,6 +1398,8 @@ public:
return HasMinimum3Maximum3F16;
}
+ bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; }
+
bool hasTanhInsts() const { return HasTanhInsts; }
bool hasAddPC64Inst() const { return GFX1250Insts; }
@@ -1500,6 +1513,12 @@ public:
bool hasVOPD3() const { return GFX1250Insts; }
+ // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
+ bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
+
+ // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
+ bool hasVectorMulU64() const { return GFX1250Insts; }
+
// \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
diff --git a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index 429ce0e0..a33dbfa 100644
--- a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -270,5 +270,6 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
MI.eraseFromParent();
}
}
+ finalizeBundles(MF);
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
index 2a3b42e..eff5b0a 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
@@ -138,7 +138,6 @@ void R600PassConfig::addPreSched2() {
void R600PassConfig::addPreEmitPass() {
addPass(createR600MachineCFGStructurizerPass());
addPass(createR600ExpandSpecialInstrsPass());
- addPass(&FinalizeMachineBundlesID);
addPass(createR600Packetizer());
addPass(createR600ControlFlowFinalizer());
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 3902d4c..40b8bcd 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -392,11 +392,13 @@ enum CPol {
TH_ATOMIC_CASCADE = 4, // Cascading vs regular
// Scope
- SCOPE = 0x3 << 3, // All Scope bits
- SCOPE_CU = 0 << 3,
- SCOPE_SE = 1 << 3,
- SCOPE_DEV = 2 << 3,
- SCOPE_SYS = 3 << 3,
+ SCOPE_SHIFT = 3,
+ SCOPE_MASK = 0x3,
+ SCOPE = SCOPE_MASK << SCOPE_SHIFT, // All Scope bits
+ SCOPE_CU = 0 << SCOPE_SHIFT,
+ SCOPE_SE = 1 << SCOPE_SHIFT,
+ SCOPE_DEV = 2 << SCOPE_SHIFT,
+ SCOPE_SYS = 3 << SCOPE_SHIFT,
NV = 1 << 5, // Non-volatile bit
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index e5d1eaa..b77da4d 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1062,9 +1062,13 @@ bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
switch (OpTy) {
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
break;
case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
break;
default:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d65c3ae..8d51ec6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -874,13 +874,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
- if (Subtarget->hasScalarSMulU64())
+ if (Subtarget->hasVectorMulU64())
+ setOperationAction(ISD::MUL, MVT::i64, Legal);
+ else if (Subtarget->hasScalarSMulU64())
setOperationAction(ISD::MUL, MVT::i64, Custom);
if (Subtarget->hasMad64_32())
setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
- if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch())
+ if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
if (Subtarget->hasIEEEMinimumMaximumInsts()) {
@@ -944,6 +946,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
}
+ if (Subtarget->hasBF16PackedInsts()) {
+ setOperationAction(
+ {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
+ MVT::v2bf16, Legal);
+ }
+
if (Subtarget->hasBF16TransInsts()) {
setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
}
@@ -1053,10 +1061,12 @@ ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
// where this is OK to use.
bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
EVT DestVT, EVT SrcVT) const {
- return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
- (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
- DestVT.getScalarType() == MVT::f32 &&
- SrcVT.getScalarType() == MVT::f16 &&
+ return DestVT.getScalarType() == MVT::f32 &&
+ ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
+ (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
+ SrcVT.getScalarType() == MVT::f16) ||
+ (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
+ SrcVT.getScalarType() == MVT::bf16)) &&
// TODO: This probably only requires no input flushing?
denormalModeIsFlushAllF32(DAG.getMachineFunction());
}
@@ -1467,6 +1477,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOVolatile;
return true;
}
+ case Intrinsic::amdgcn_flat_load_monitor_b32:
+ case Intrinsic::amdgcn_flat_load_monitor_b64:
+ case Intrinsic::amdgcn_flat_load_monitor_b128:
+ case Intrinsic::amdgcn_global_load_monitor_b32:
+ case Intrinsic::amdgcn_global_load_monitor_b64:
+ case Intrinsic::amdgcn_global_load_monitor_b128:
case Intrinsic::amdgcn_ds_load_tr6_b96:
case Intrinsic::amdgcn_ds_load_tr4_b64:
case Intrinsic::amdgcn_ds_load_tr8_b64:
@@ -1540,7 +1556,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
- case Intrinsic::amdgcn_s_prefetch_data: {
+ case Intrinsic::amdgcn_s_prefetch_data:
+ case Intrinsic::amdgcn_flat_prefetch:
+ case Intrinsic::amdgcn_global_prefetch: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
Info.ptrVal = CI.getArgOperand(0);
@@ -1591,10 +1609,16 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_load_monitor_b128:
+ case Intrinsic::amdgcn_flat_load_monitor_b32:
+ case Intrinsic::amdgcn_flat_load_monitor_b64:
case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
+ case Intrinsic::amdgcn_global_load_monitor_b128:
+ case Intrinsic::amdgcn_global_load_monitor_b32:
+ case Intrinsic::amdgcn_global_load_monitor_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
case Intrinsic::amdgcn_global_load_tr4_b64:
@@ -4432,19 +4456,28 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
}
SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
- if (Op->isDivergent())
+ if (Op->isDivergent() &&
+ (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
+ // Cannot do I$ prefetch with divergent pointer.
return SDValue();
switch (cast<MemSDNode>(Op)->getAddressSpace()) {
case AMDGPUAS::FLAT_ADDRESS:
case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::CONSTANT_ADDRESS:
- case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
break;
+ case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+ if (Subtarget->hasSafeSmemPrefetch())
+ break;
+ [[fallthrough]];
default:
return SDValue();
}
+ // I$ prefetch
+ if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
+ return SDValue();
+
return Op;
}
@@ -5415,6 +5448,19 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineOperand &Src0 = MI.getOperand(1);
MachineOperand &Src1 = MI.getOperand(2);
+ if (ST.hasAddSubU64Insts()) {
+ auto I = BuildMI(*BB, MI, DL,
+ TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
+ : AMDGPU::V_SUB_U64_e64),
+ Dest.getReg())
+ .add(Src0)
+ .add(Src1)
+ .addImm(0); // clamp
+ TII->legalizeOperands(*I);
+ MI.eraseFromParent();
+ return BB;
+ }
+
if (IsAdd && ST.hasLshlAddU64Inst()) {
auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
Dest.getReg())
@@ -14047,7 +14093,8 @@ static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
case ISD::FMAXIMUMNUM:
case AMDGPUISD::FMIN_LEGACY:
case AMDGPUISD::FMAX_LEGACY:
- return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
+ return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
+ (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
@@ -14132,6 +14179,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
(VT == MVT::f32 || VT == MVT::f64 ||
(VT == MVT::f16 && Subtarget->has16BitInsts()) ||
+ (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
+ (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
(VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
Op0.hasOneUse()) {
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 9faf497..dd3f2fe 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2108,8 +2108,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
assert(TII->isFLAT(MI));
- // All flat instructions use the VMEM counter.
- assert(TII->usesVM_CNT(MI));
+ // All flat instructions use the VMEM counter except prefetch.
+ if (!TII->usesVM_CNT(MI))
+ return false;
// If there are no memory operands then conservatively assume the flat
// operation may access VMEM.
@@ -2295,9 +2296,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
}
- // A Flat memory operation must access at least one address space.
- assert(FlatASCount);
-
// This is a flat memory operation that access both VMEM and LDS, so note it
// - it will require that both the VM and LGKM be flushed to zero if it is
// pending when a VM or LGKM dependency occurs.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 571f3ef..8d6c1d0 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2508,7 +2508,20 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addReg(DstHi);
}
break;
+
+ case AMDGPU::V_MAX_BF16_PSEUDO_e64:
+ assert(ST.hasBF16PackedInsts());
+ MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
+ MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
+ MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
+ MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
+ auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
+ Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
+ auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+ Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
+ break;
}
+
return true;
}
@@ -2733,49 +2746,47 @@ static MachineInstr *swapImmOperands(MachineInstr &MI,
}
bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
- const MachineOperand *MO0, unsigned OpIdx1,
- const MachineOperand *MO1) const {
+ unsigned OpIdx1) const {
const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
- const TargetRegisterClass *DefinedRC1 =
- OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo1.RegClass) : nullptr;
- const TargetRegisterClass *DefinedRC0 =
- OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo0.RegClass) : nullptr;
unsigned Opc = MI.getOpcode();
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ const MachineOperand &MO0 = MI.getOperand(OpIdx0);
+ const MachineOperand &MO1 = MI.getOperand(OpIdx1);
+
// Swap doesn't breach constant bus or literal limits
// It may move literal to position other than src0, this is not allowed
// pre-gfx10 However, most test cases need literals in Src0 for VOP
// FIXME: After gfx9, literal can be in place other than Src0
if (isVALU(MI)) {
- if ((int)OpIdx0 == Src0Idx && !MO0->isReg() &&
- !isInlineConstant(*MO0, OpInfo1))
+ if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
+ !isInlineConstant(MO0, OpInfo1))
return false;
- if ((int)OpIdx1 == Src0Idx && !MO1->isReg() &&
- !isInlineConstant(*MO1, OpInfo0))
+ if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
+ !isInlineConstant(MO1, OpInfo0))
return false;
}
- if ((int)OpIdx1 != Src0Idx && MO0->isReg()) {
- if (!DefinedRC1)
+ if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
+ if (OpInfo1.RegClass == -1)
return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
- return isLegalRegOperand(MI, OpIdx1, *MO0) &&
- (!MO1->isReg() || isLegalRegOperand(MI, OpIdx0, *MO1));
+ return isLegalRegOperand(MI, OpIdx1, MO0) &&
+ (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
}
- if ((int)OpIdx0 != Src0Idx && MO1->isReg()) {
- if (!DefinedRC0)
+ if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
+ if (OpInfo0.RegClass == -1)
return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
- return (!MO0->isReg() || isLegalRegOperand(MI, OpIdx1, *MO0)) &&
- isLegalRegOperand(MI, OpIdx0, *MO1);
+ return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
+ isLegalRegOperand(MI, OpIdx0, MO1);
}
// No need to check 64-bit literals since swapping does not bring new
// 64-bit literals into current instruction to fold to 32-bit
- return isImmOperandLegal(MI, OpIdx1, *MO0);
+ return isImmOperandLegal(MI, OpIdx1, MO0);
}
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
@@ -2797,12 +2808,12 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
static_cast<int>(Src1Idx) &&
"inconsistency with findCommutedOpIndices");
- MachineOperand &Src0 = MI.getOperand(Src0Idx);
- MachineOperand &Src1 = MI.getOperand(Src1Idx);
- if (!isLegalToSwap(MI, Src0Idx, &Src0, Src1Idx, &Src1)) {
+ if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
return nullptr;
- }
+
MachineInstr *CommutedMI = nullptr;
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ MachineOperand &Src1 = MI.getOperand(Src1Idx);
if (Src0.isReg() && Src1.isReg()) {
// Be sure to copy the source modifiers to the right place.
CommutedMI =
@@ -7361,6 +7372,10 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MUL_U64:
+ if (ST.hasVectorMulU64()) {
+ NewOpcode = AMDGPU::V_MUL_U64_e64;
+ break;
+ }
// Split s_mul_u64 in 32-bit vector multiplications.
splitScalarSMulU64(Worklist, Inst, MDT);
Inst.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 800ea9a..2ffb783 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -197,8 +197,7 @@ protected:
AMDGPU::OpName Src0OpName, MachineOperand &Src1,
AMDGPU::OpName Src1OpName) const;
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx,
- const MachineOperand *fromMO, unsigned toIdx,
- const MachineOperand *toMO) const;
+ unsigned toIdx) const;
MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx0,
unsigned OpIdx1) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index bd4995b..83b0490 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1662,6 +1662,8 @@ def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">;
def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
+def VOP3PMadMixBF16ModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsExt">;
+def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">;
def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;
@@ -2863,9 +2865,11 @@ def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>;
def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=*/1>;
+def VOP_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, untyped]>;
def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
+def VOP_BF16_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, bf16, untyped]>;
def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>;
def VOP_I32_I16 : VOPProfile <[i32, i16, untyped, untyped]>;
@@ -2873,10 +2877,12 @@ def VOP_I16_I32 : VOPProfile <[i16, i32, untyped, untyped]>;
def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>;
def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>;
+def VOP_V2BF16_V2BF16_V2BF16 : VOPProfile <[v2bf16, v2bf16, v2bf16, untyped]>;
def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>;
def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>;
def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>;
+def VOP_V2BF16_V2BF16_V2BF16_V2BF16 : VOPProfile <[v2bf16, v2bf16, v2bf16, v2bf16]>;
def VOP_V2I16_F32_F32 : VOPProfile <[v2i16, f32, f32, untyped]>;
def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, untyped]>;
@@ -2912,8 +2918,10 @@ def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
def VOP_I16_F32_F32 : VOPProfile <[i16, f32, f32, untyped]>;
def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=*/1>;
+def VOP_I64_I64_I64_ARITH : VOPProfile <[i64, i64, i64, untyped], /*EnableClamp=*/1>;
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
+def VOP_F32_BF16_BF16_BF16 : VOPProfile <[f32, bf16, bf16, bf16]>;
def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;
def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>;
def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index d05be8f..54fa192 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1894,6 +1894,9 @@ let SubtargetPredicate = UseRealTrue16Insts in
def : ClampPat<V_MAX_F16_t16_e64, f16>;
let SubtargetPredicate = UseFakeTrue16Insts in
def : ClampPat<V_MAX_F16_fake16_e64, f16>;
+// FIXME-TRUE16: Pseudo expansion of this won't work with True16.
+let True16Predicate = UseFakeTrue16Insts in
+def : ClampPat<V_MAX_BF16_PSEUDO_e64, bf16>;
let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
@@ -1903,6 +1906,13 @@ def : GCNPat <
>;
}
+let SubtargetPredicate = HasBF16PackedInsts in {
+def : GCNPat <
+ (v2bf16 (AMDGPUclamp (VOP3PMods v2bf16:$src0, i32:$src0_modifiers))),
+ (V_PK_MAX_NUM_BF16 $src0_modifiers, $src0,
+ $src0_modifiers, $src0, DSTCLAMP.ENABLE)
+>;
+} // End SubtargetPredicate = HasBF16PackedInsts
/********** ================================ **********/
/********** Floating point absolute/negative **********/
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index f0be204..9a1448f 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -81,11 +81,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
}
- MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm;
- if (!MFMAVGPRForm && ST.hasGFX90AInsts() &&
- ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
- !mayUseAGPRs(F))
- MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
+ MayNeedAGPRs = ST.hasMAIInsts();
+ if (ST.hasGFX90AInsts()) {
+ // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection
+ // should be separated from availability of AGPRs
+ if (MFMAVGPRForm ||
+ (ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
+ !mayUseAGPRs(F)))
+ MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
+ }
if (AMDGPU::isChainCC(CC)) {
// Chain functions don't receive an SP from their caller, but are free to
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 3212060..0e8a420 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -704,16 +704,16 @@ void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
}
-/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
-/// If this tag isn't present, or if it has no meaningful values, returns \p
-/// Default. Otherwise returns all the address spaces concerned by the MMRA.
-static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
- SIAtomicAddrSpace Default) {
- static constexpr StringLiteral FenceASPrefix = "amdgpu-as";
+/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
+/// If this tag isn't present, or if it has no meaningful values, returns
+/// \p none, otherwise returns the address spaces specified by the MD.
+static std::optional<SIAtomicAddrSpace>
+getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
+ static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
if (!MMRA)
- return Default;
+ return std::nullopt;
SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
for (const auto &[Prefix, Suffix] : MMRA) {
@@ -726,7 +726,10 @@ static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
diagnoseUnknownMMRAASName(MI, Suffix);
}
- return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
+ if (Result == SIAtomicAddrSpace::NONE)
+ return std::nullopt;
+
+ return Result;
}
} // end anonymous namespace
@@ -903,12 +906,19 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
*ScopeOrNone;
- if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
- ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+ if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
+ // We currently expect refineOrderingAS to be the only place that
+ // can refine the AS ordered by the fence.
+ // If that changes, we need to review the semantics of that function
+ // in case it needs to preserve certain address spaces.
reportUnsupported(MI, "Unsupported atomic address space");
return std::nullopt;
}
+ auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI);
+ if (SynchronizeAS)
+ OrderingAddrSpace = *SynchronizeAS;
+
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
}
@@ -2687,11 +2697,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
AtomicPseudoMIs.push_back(MI);
bool Changed = false;
- // Refine fenced address space based on MMRAs.
- //
- // TODO: Should we support this MMRA on other atomic operations?
- auto OrderingAddrSpace =
- getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace());
+ const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
if (MOI.isAtomic()) {
const AtomicOrdering Order = MOI.getOrdering();
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 0039d2f..218841d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -109,6 +109,23 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
let TSFlags{2} = HasVGPR;
let TSFlags{3} = HasAGPR;
let TSFlags{4} = HasSGPR;
+
+ // RA will use RegisterClass AllocationPriority amongst other info (e.g. ordering in the basic block)
+ // to decide which registers to try to assign first. Usually, this RegisterClass priority is given
+ // very high priority, if not the highest priority, when considering which VirtReg to allocate next.
+ //
+ // We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to
+ // assign more constrained RegisterClasses first. As a result, we prioritize register classes with
+ // more 32 bit tuples (e.g. VReg_512) over registers with fewer tuples (e.g. VGPR_32).
+ //
+ // The interesting case is the vector register case on architectures which have ARegs, VRegs, AVRegs.
+ // In this case, we would like to assign ARegs and VRegs before AVRegs, as AVRegs are less constrained
+ // and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the
+ // RegisterClass AllocationPriority. BaseClassPriority is used to turn the bit on, and BaseClassScaleFactor
+ // is used for scaling of the bit (i.e. 1 << 4).
+ field int BaseClassPriority = 1;
+ field int BaseClassScaleFactor = 16;
+
}
multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
@@ -575,7 +592,7 @@ let HasVGPR = 1 in {
def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (interleave (sequence "VGPR%u_LO16", 0, 255),
(sequence "VGPR%u_HI16", 0, 255)))> {
- let AllocationPriority = 2;
+ let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 16;
let GeneratePressureSet = 0;
@@ -601,7 +618,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
// i16/f16 only on VI+
def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 255))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 32;
let Weight = 1;
let BaseClassOrder = 32;
@@ -610,7 +627,7 @@ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types
// Identical to VGPR_32 except it only contains the low 128 (Lo128) registers.
def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 127))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let GeneratePressureSet = 0;
let Size = 32;
let Weight = 1;
@@ -668,7 +685,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
// AccVGPR 32-bit registers
def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add (sequence "AGPR%u", 0, 255))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 32;
let Weight = 1;
let BaseClassOrder = 32;
@@ -940,14 +957,23 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
// Requires n v_mov_b32 to copy
let CopyCost = numRegs;
- let AllocationPriority = !sub(numRegs, 1);
+
+ // Since we only have 5 bits for the RegisterClass Allocation Priorty, and since we use the
+ // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result
+ // of this encoding, for registers with numRegs 15 or 16, we give SizePriority of 14, and for
+ // regsters with numRegs 17+ we give SizePriority of 15. In practice, there is only one
+ // RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512},
+ // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing.
+ defvar SizePrioriity = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15));
+
+ let AllocationPriority = !add(SizePrioriity, !mul(BaseClassPriority, BaseClassScaleFactor));
let Weight = numRegs;
}
// Define a register tuple class, along with one requiring an even
// aligned base register.
multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
- let HasVGPR = 1 in {
+ let HasVGPR = 1, BaseClassPriority = 1 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList> {
let BaseClassOrder = !mul(numRegs, 32);
@@ -981,7 +1007,7 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>;
}
multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
- let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in {
+ let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList> {
let BaseClassOrder = !mul(numRegs, 32);
@@ -1066,6 +1092,7 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6
def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> {
let HasVGPR = 1;
let HasAGPR = 1;
+ let BaseClassPriority = 0;
let Size = 32;
}
} // End GeneratePressureSet = 0
@@ -1074,7 +1101,7 @@ def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_3
// aligned base register.
multiclass AVRegClass<int numRegs, list<ValueType> regTypes,
dag vregList, dag aregList> {
- let HasVGPR = 1, HasAGPR = 1 in {
+ let HasVGPR = 1, HasAGPR = 1, BaseClassPriority = 0 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>;
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 38cc51b..4bda51d 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -856,9 +856,9 @@ def smrd_sextloadi16 : SMRDLoadPat<sextloadi16>;
def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
(prefetch node:$ptr, node:$rw, node:$loc, node:$type),
- [{ return !N->getOperand(1)->isDivergent();}]> {
+ [{ return !N->getOperand(1)->isDivergent() && Subtarget->hasSafeSmemPrefetch();}]> {
let GISelPredicateCode = [{
- return isInstrUniform(MI);
+ return isInstrUniform(MI) && Subtarget->hasSafeSmemPrefetch();
}];
}
@@ -1152,6 +1152,7 @@ multiclass SMPrefetchPat<string type, TImmLeaf cache_type> {
}
defm : SMPrefetchPat<"INST", i32imm_zero>;
+let AddedComplexity = 12 in // Prefer scalar prefetch over global for r/o case.
defm : SMPrefetchPat<"DATA", i32imm_one>;
let SubtargetPredicate = isGFX12Plus in {
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 030a6e1..550ec9d 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -925,6 +925,17 @@ let isAdd = 1 in {
defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32">;
}
+let isReMaterializable = 1 in {
+let SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit] in {
+defm V_ADD_U64 : VOP2Inst <"v_add_nc_u64", VOP_I64_I64_I64_ARITH>;
+// We don't actually have something like V_SUBREV_U64 so V_SUB_U64 can't be treated as commutable.
+let isCommutable = 0 in
+defm V_SUB_U64 : VOP2Inst <"v_sub_nc_u64", VOP_I64_I64_I64_ARITH>;
+} // End SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit]
+let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDouble] in
+defm V_MUL_U64 : VOP2Inst <"v_mul_u64", VOP_I64_I64_I64, DivergentBinFrag<mul>>;
+} // End isReMaterializable = 1
+
} // End isCommutable = 1
// These are special and do not read the exec mask.
@@ -1754,6 +1765,9 @@ multiclass VOP2_Real_FULL_with_name<GFXGen Gen, bits<6> op, string opName,
VOP2_Realtriple_e64_with_name<Gen, op, opName, asmName>,
VOP2_Real_NO_VOP3_with_name<Gen, op, opName, asmName>;
+multiclass VOP2_Real_NO_DPP<GFXGen Gen, bits<6> op> :
+ VOP2_Real_e32<Gen, op>, VOP2_Real_e64<Gen, op>;
+
multiclass VOP2_Real_NO_DPP_with_name<GFXGen Gen, bits<6> op, string opName,
string asmName> {
defm NAME : VOP2_Real_e32_with_name<Gen, op, opName, asmName>,
@@ -1843,6 +1857,9 @@ defm V_FMAC_F64 : VOP2_Real_FULL<GFX12Gen, 0x17>;
defm V_FMAMK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x23>;
defm V_FMAAK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x24>;
+defm V_ADD_U64 : VOP2_Real_FULL<GFX1250Gen, 0x28>;
+defm V_SUB_U64 : VOP2_Real_FULL<GFX1250Gen, 0x29>;
+defm V_MUL_U64 : VOP2_Real_NO_DPP<GFX1250Gen, 0x2a>;
//===----------------------------------------------------------------------===//
// GFX11.
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index a259e5c..95fcd4a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -35,14 +35,18 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
bit useTiedOutput = 0> : VOP3P_Profile<P, Features, 1> {
bit UseTiedOutput = useTiedOutput;
+ defvar Src0RC = getVCSrcForVT<P.Src0VT>.ret;
+ defvar Src1RC = getVCSrcForVT<P.Src1VT>.ret;
+ defvar Src2RC = getVCSrcForVT<P.Src2VT>.ret;
+
dag srcs =
- (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
- FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
- FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+ (ins FP16InputMods:$src0_modifiers, Src0RC:$src0,
+ FP16InputMods:$src1_modifiers, Src1RC:$src1,
+ FP16InputMods:$src2_modifiers, Src2RC:$src2);
dag dpp_srcs =
(ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0,
FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1,
- FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+ FP16InputMods:$src2_modifiers, Src2RC:$src2);
// FIXME: Clamp0 misbehaves with the non-default vdst_in
// following it. For now workaround this by requiring clamp
@@ -144,48 +148,59 @@ def : VOP3PSatPat<usubsat, V_PK_SUB_U16>;
def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>;
} // End SubtargetPredicate = HasVOP3PInsts
-let SubtargetPredicate = HasMinimum3Maximum3PKF16, FPDPRounding = 1 in {
+let isCommutable = 1, FPDPRounding = 1 in {
+let SubtargetPredicate = HasMin3Max3PKF16 in {
+defm V_PK_MIN3_NUM_F16 : VOP3PInst<"v_pk_min3_num_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmin3>;
+defm V_PK_MAX3_NUM_F16 : VOP3PInst<"v_pk_max3_num_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmax3>;
+}
+
+let SubtargetPredicate = HasMinimum3Maximum3PKF16 in {
defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfminimum3>;
defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmaximum3>;
}
+} // End isCommutable = 1, FPDPRounding = 1
// TODO: Make sure we're doing the right thing with denormals. Note
// that FMA and MAD will differ.
multiclass MadFmaMixPats<SDPatternOperator fma_like,
Instruction mix_inst,
Instruction mixlo_inst,
- Instruction mixhi_inst> {
+ Instruction mixhi_inst,
+ ValueType VT = f16,
+ ValueType vecVT = v2f16> {
+ defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
+ defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
// At least one of the operands needs to be an fpextend of an f16
// for this to be worthwhile, so we need three patterns here.
// TODO: Could we use a predicate to inspect src1/2/3 instead?
def : GCNPat <
- (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_mods)))),
+ (f32 (fma_like (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
def : GCNPat <
- (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
- (f32 (VOP3PMadMixModsExt f16:$src1, i32:$src1_mods)),
- (f32 (VOP3PMadMixMods f32:$src2, i32:$src2_mods)))),
+ (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
def : GCNPat <
- (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_mods)),
- (f32 (VOP3PMadMixModsExt f16:$src2, i32:$src2_mods)))),
+ (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
def : GCNPat <
(AMDGPUclamp (build_vector
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
- (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))),
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))),
+ (vecVT (mixhi_inst $hi_src0_modifiers, $hi_src0,
$hi_src1_modifiers, $hi_src1,
$hi_src2_modifiers, $hi_src2,
DSTCLAMP.ENABLE,
@@ -197,8 +212,8 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
def : GCNPat <
- (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
+ (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))),
(mixlo_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
(i32 0), (i32 0),
@@ -207,9 +222,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
def : GCNPat <
- (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ (build_vector VT:$elt0, (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers)))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
(i32 0), (i32 0),
DSTCLAMP.NONE,
@@ -217,9 +232,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
def : GCNPat <
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))),
(mixlo_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
@@ -234,10 +249,10 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
let True16Predicate = p in {
def : GCNPat <
- (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.NONE,
@@ -246,11 +261,11 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
def : GCNPat <
(build_vector
- f16:$elt0,
- (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ VT:$elt0,
+ (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.ENABLE,
@@ -261,38 +276,38 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
let True16Predicate = UseRealTrue16Insts in {
def : GCNPat <
- (build_vector (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), f16:$elt1),
- (v2f16 (mixlo_inst $src0_modifiers, $src0,
+ (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1),
+ (vecVT (mixlo_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.NONE,
- (REG_SEQUENCE VGPR_32, (f16 (IMPLICIT_DEF)), lo16, $elt1, hi16)))
+ (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16)))
>;
def : GCNPat <
- (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.NONE,
- (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
+ (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
>;
def : GCNPat <
(build_vector
- f16:$elt0,
- (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ VT:$elt0,
+ (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.ENABLE,
- (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
+ (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
>;
} // end True16Predicate
}
@@ -353,6 +368,24 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F
defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
}
+let SubtargetPredicate = HasFmaMixBF16Insts in {
+let isCommutable = 1 in {
+
+let isReMaterializable = 1 in
+defm V_FMA_MIX_F32_BF16 : VOP3_VOP3PInst<"v_fma_mix_f32_bf16", VOP3P_Mix_Profile<VOP_F32_BF16_BF16_BF16, VOP3_OPSEL>>;
+
+let FPDPRounding = 1 in {
+defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
+
+let ClampLo = 0, ClampHi = 1 in {
+defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
+}
+} // End FPDPRounding = 1
+} // End isCommutable = 1
+
+defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
+} // End SubtargetPredicate = HasFmaMixBF16Insts
+
def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
let HasModifiers = 0;
}
@@ -1196,6 +1229,20 @@ let isCommutable = 1, isReMaterializable = 1 in {
let SubtargetPredicate = HasPkMovB32, isAsCheapAsAMove = 1 in
defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
+
+ let SubtargetPredicate = HasBF16PackedInsts in {
+ defm V_PK_ADD_BF16 : VOP3PInst<"v_pk_add_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fadd>;
+ defm V_PK_MUL_BF16 : VOP3PInst<"v_pk_mul_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fmul>;
+ defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>;
+ defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>;
+ defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>;
+
+ // Scalar pseudo used to emulate AMDGPUClamp.
+ // Expanded to V_PK_MAX_NUM_BF16 with unused high half.
+ // FIXME-TRUE16: Pseudo expansion of this won't work with True16.
+ let True16Predicate = UseFakeTrue16Insts in
+ defm V_MAX_BF16_PSEUDO : VOP3Inst <"v_max_bf16", VOP_BF16_BF16_BF16>;
+ }
} // End isCommutable = 1, isReMaterializable = 1
def : AMDGPUMnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
@@ -2210,6 +2257,10 @@ multiclass VOP3P_Real_with_name_gfx12<bits<8> op,
defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">;
defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">;
+defm V_PK_FMA_F32 : VOP3P_Real_gfx12<0x1f>;
+defm V_PK_MUL_F32 : VOP3P_Real_gfx12<0x28>;
+defm V_PK_ADD_F32 : VOP3P_Real_gfx12<0x29>;
+
defm V_PK_ADD_MAX_I16 : VOP3P_Real_gfx1250<0x14>;
defm V_PK_ADD_MAX_U16 : VOP3P_Real_gfx1250<0x15>;
defm V_PK_ADD_MIN_I16 : VOP3P_Real_gfx1250<0x2d>;
@@ -2218,6 +2269,22 @@ defm V_PK_MAX3_I16 : VOP3P_Real_gfx1250<0x2f>;
defm V_PK_MAX3_U16 : VOP3P_Real_gfx1250<0x30>;
defm V_PK_MIN3_I16 : VOP3P_Real_gfx1250<0x31>;
defm V_PK_MIN3_U16 : VOP3P_Real_gfx1250<0x32>;
+defm V_PK_FMA_BF16 : VOP3P_Real_gfx1250<0x11>;
+defm V_PK_ADD_BF16 : VOP3P_Real_gfx1250<0x23>;
+defm V_PK_MUL_BF16 : VOP3P_Real_gfx1250<0x2a>;
+defm V_PK_MIN_NUM_BF16 : VOP3P_Real_gfx1250<0x2b>;
+defm V_PK_MAX_NUM_BF16 : VOP3P_Real_gfx1250<0x2c>;
+defm V_PK_MINIMUM3_F16 : VOP3P_Real_gfx1250<0x36>;
+defm V_PK_MAXIMUM3_F16 : VOP3P_Real_gfx1250<0x37>;
+defm V_PK_MIN3_NUM_F16 : VOP3P_Real_gfx1250<0x38>;
+defm V_PK_MAX3_NUM_F16 : VOP3P_Real_gfx1250<0x39>;
+
+defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>;
+defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>;
+defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>;
+
+let AssemblerPredicate = isGFX1250Plus in
+def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">;
defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 8b7f06a..fca5dff 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -20347,6 +20347,13 @@ ARMTargetLowering::getSingleConstraintMatchWeight(
return weight;
}
+static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
+ if (PR == 0 || VT == MVT::Other)
+ return false;
+ return (ARM::SPRRegClass.contains(PR) && VT != MVT::f32 && VT != MVT::i32) ||
+ (ARM::DPRRegClass.contains(PR) && VT != MVT::f64);
+}
+
using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
@@ -20420,7 +20427,10 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
if (StringRef("{cc}").equals_insensitive(Constraint))
return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
- return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+ auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+ if (isIncompatibleReg(RCP.first, VT))
+ return {0, nullptr};
+ return RCP;
}
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index ec6f4e2..ece6c10 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -12327,7 +12327,7 @@ bool ARMAsmParser::parseDirectiveEven(SMLoc L) {
}
assert(Section && "must have section to emit alignment");
- if (Section->useCodeAlign())
+ if (getContext().getAsmInfo()->useCodeAlign(*Section))
getStreamer().emitCodeAlignment(Align(2), &getSTI());
else
getStreamer().emitValueToAlignment(Align(2));
@@ -12525,7 +12525,7 @@ bool ARMAsmParser::parseDirectiveAlign(SMLoc L) {
// '.align' is target specifically handled to mean 2**2 byte alignment.
const MCSection *Section = getStreamer().getCurrentSectionOnly();
assert(Section && "must have section to emit alignment");
- if (Section->useCodeAlign())
+ if (getContext().getAsmInfo()->useCodeAlign(*Section))
getStreamer().emitCodeAlignment(Align(4), &getSTI(), 0);
else
getStreamer().emitValueToAlignment(Align(4), 0, 1, 0);
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index a7a9911..868556b 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -708,8 +708,6 @@ private:
void SwitchToExTabSection(const MCSymbol &FnStart);
void SwitchToExIdxSection(const MCSymbol &FnStart);
- void EmitFixup(const MCExpr *Expr, MCFixupKind Kind);
-
bool IsThumb;
bool IsAndroid;
@@ -1096,8 +1094,8 @@ void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
}
void ARMTargetELFStreamer::annotateTLSDescriptorSequence(
- const MCSymbolRefExpr *S) {
- getStreamer().EmitFixup(S, FK_Data_4);
+ const MCSymbolRefExpr *Expr) {
+ getStreamer().addFixup(Expr, FK_Data_4);
}
void ARMTargetELFStreamer::emitCode16() { getStreamer().setIsThumb(true); }
@@ -1140,7 +1138,8 @@ void ARMTargetELFStreamer::finish() {
MCContext &Ctx = getContext();
auto &Asm = getStreamer().getAssembler();
if (any_of(Asm, [](const MCSection &Sec) {
- return cast<MCSectionELF>(Sec).getFlags() & ELF::SHF_ARM_PURECODE;
+ return static_cast<const MCSectionELF &>(Sec).getFlags() &
+ ELF::SHF_ARM_PURECODE;
})) {
auto *Text =
static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection());
@@ -1206,11 +1205,6 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
SectionKind::getData(), FnStart);
}
-void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
- MCFragment *Frag = getCurrentFragment();
- Frag->addFixup(MCFixup::create(Frag->getContents().size(), Expr, Kind));
-}
-
void ARMELFStreamer::EHReset() {
ExTab = nullptr;
FnStart = nullptr;
diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index ad8aa571..0fb33cd 100644
--- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -260,7 +260,7 @@ bool AVRAsmPrinter::doFinalization(Module &M) {
continue;
}
- auto *Section = cast<MCSectionELF>(TLOF.SectionForGlobal(&GO, TM));
+ auto *Section = static_cast<MCSectionELF *>(TLOF.SectionForGlobal(&GO, TM));
if (Section->getName().starts_with(".data"))
NeedsCopyData = true;
else if (Section->getName().starts_with(".rodata") && SubTM->hasLPM())
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
index c2c1bb4..0615ec9 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
@@ -24,8 +24,6 @@ AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options) {
CalleeSaveStackSlotSize = 2;
CommentString = ";";
SeparatorString = "$";
- PrivateGlobalPrefix = ".L";
- PrivateLabelPrefix = ".L";
UsesELFSectionDirectiveForBSS = true;
SupportsDebugInformation = true;
}
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
index fab2713..1915fa8 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
@@ -14,7 +14,7 @@
#define LLVM_AVR_ASM_INFO_H
#include "MCTargetDesc/AVRMCExpr.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
#include "llvm/MC/MCExpr.h"
namespace llvm {
@@ -22,7 +22,7 @@ namespace llvm {
class Triple;
/// Specifies the format of AVR assembly files.
-class AVRMCAsmInfo : public MCAsmInfo {
+class AVRMCAsmInfo : public MCAsmInfoELF {
public:
explicit AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options);
void printSpecifierExpr(raw_ostream &OS,
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
index 5963976..6ec78d0 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -7,12 +7,10 @@
//===----------------------------------------------------------------------===//
#include "AVRMCExpr.h"
-#include "MCTargetDesc/AVRMCAsmInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCValue.h"
namespace llvm {
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index 1e29a0f..a87b9a2 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -1255,10 +1255,8 @@ void BTFDebug::beginFunctionImpl(const MachineFunction *MF) {
FuncInfo.Label = FuncLabel;
FuncInfo.TypeId = FuncTypeId;
if (FuncLabel->isInSection()) {
- MCSection &Section = FuncLabel->getSection();
- const MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section);
- assert(SectionELF && "Null section for Function Label");
- SecNameOff = addString(SectionELF->getName());
+ auto &Sec = static_cast<const MCSectionELF &>(FuncLabel->getSection());
+ SecNameOff = addString(Sec.getName());
} else {
SecNameOff = addString(".text");
}
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index 827e928..bb74f6a 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -54,11 +54,8 @@ unsigned BPFELFObjectWriter::getRelocType(const MCFixup &Fixup,
const MCSymbol &Sym = *A;
if (Sym.isDefined()) {
- MCSection &Section = Sym.getSection();
- const MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section);
- assert(SectionELF && "Null section for reloc symbol");
-
- unsigned Flags = SectionELF->getFlags();
+ auto &Section = static_cast<const MCSectionELF &>(Sym.getSection());
+ unsigned Flags = Section.getFlags();
if (Sym.isTemporary()) {
// .BTF.ext generates FK_Data_4 relocations for
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
index a0011e8..fa9007e 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
@@ -16,7 +16,6 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
index 7b21684..63d6e6f 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -13,18 +13,19 @@
#ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H
#define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
#include "llvm/TargetParser/Triple.h"
namespace llvm {
-class BPFMCAsmInfo : public MCAsmInfo {
+class BPFMCAsmInfo : public MCAsmInfoELF {
public:
explicit BPFMCAsmInfo(const Triple &TT, const MCTargetOptions &Options) {
if (TT.getArch() == Triple::bpfeb)
IsLittleEndian = false;
PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = "L";
WeakRefDirective = "\t.weak\t";
UsesELFSectionDirectiveForBSS = true;
diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index d9d9b36..feecfc0 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -301,41 +301,53 @@ bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
}
bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
- Value *PtrOperand = GEPI.getPointerOperand();
- Type *OrigGEPType = GEPI.getSourceElementType();
- Type *NewGEPType = OrigGEPType;
+ GEPOperator *GOp = cast<GEPOperator>(&GEPI);
+ Value *PtrOperand = GOp->getPointerOperand();
+ Type *NewGEPType = GOp->getSourceElementType();
bool NeedsTransform = false;
+ // Unwrap GEP ConstantExprs to find the base operand and element type
+ while (auto *CE = dyn_cast<ConstantExpr>(PtrOperand)) {
+ if (auto *GEPCE = dyn_cast<GEPOperator>(CE)) {
+ GOp = GEPCE;
+ PtrOperand = GEPCE->getPointerOperand();
+ NewGEPType = GEPCE->getSourceElementType();
+ } else
+ break;
+ }
+
if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand)) {
NewGEPType = NewGlobal->getValueType();
PtrOperand = NewGlobal;
NeedsTransform = true;
} else if (AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrOperand)) {
Type *AllocatedType = Alloca->getAllocatedType();
- // Only transform if the allocated type is an array
- if (AllocatedType != OrigGEPType && isa<ArrayType>(AllocatedType)) {
+ if (isa<ArrayType>(AllocatedType) &&
+ AllocatedType != GOp->getResultElementType()) {
NewGEPType = AllocatedType;
NeedsTransform = true;
}
}
- // Scalar geps should remain scalars geps. The dxil-flatten-arrays pass will
- // convert these scalar geps into flattened array geps
- if (!isa<ArrayType>(OrigGEPType))
- NewGEPType = OrigGEPType;
-
- // Note: We bail if this isn't a gep touched via alloca or global
- // transformations
if (!NeedsTransform)
return false;
- IRBuilder<> Builder(&GEPI);
- SmallVector<Value *, MaxVecSize> Indices(GEPI.indices());
+ // Keep scalar GEPs scalar; dxil-flatten-arrays will do flattening later
+ if (!isa<ArrayType>(GOp->getSourceElementType()))
+ NewGEPType = GOp->getSourceElementType();
+ IRBuilder<> Builder(&GEPI);
+ SmallVector<Value *, MaxVecSize> Indices(GOp->indices());
Value *NewGEP = Builder.CreateGEP(NewGEPType, PtrOperand, Indices,
- GEPI.getName(), GEPI.getNoWrapFlags());
- GEPI.replaceAllUsesWith(NewGEP);
- GEPI.eraseFromParent();
+ GOp->getName(), GOp->getNoWrapFlags());
+
+ GOp->replaceAllUsesWith(NewGEP);
+
+ if (auto *CE = dyn_cast<ConstantExpr>(GOp))
+ CE->destroyConstant();
+ else if (auto *OldGEPI = dyn_cast<GetElementPtrInst>(GOp))
+ OldGEPI->eraseFromParent();
+
return true;
}
diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
index f0e2e78..7e1436e 100644
--- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
+++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
@@ -263,8 +263,13 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// merge the byte offsets. Otherwise, this GEP is itself the root of a GEP
// chain and we need to deterine the root array type
if (auto *PtrOpGEP = dyn_cast<GEPOperator>(PtrOperand)) {
- assert(GEPChainInfoMap.contains(PtrOpGEP) &&
- "Expected parent GEP to be visited before this GEP");
+
+ // If the parent GEP was not processed, then we do not want to process its
+ // descendants. This can happen if the GEP chain is for an unsupported type
+ // such as a struct -- we do not flatten structs nor GEP chains for structs
+ if (!GEPChainInfoMap.contains(PtrOpGEP))
+ return false;
+
GEPInfo &PGEPInfo = GEPChainInfoMap[PtrOpGEP];
Info.RootFlattenedArrayType = PGEPInfo.RootFlattenedArrayType;
Info.RootPointerOperand = PGEPInfo.RootPointerOperand;
diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
index c73648f..3427968 100644
--- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
+++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
@@ -24,18 +24,19 @@
using namespace llvm;
-static void legalizeFreeze(Instruction &I,
+static bool legalizeFreeze(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *>) {
auto *FI = dyn_cast<FreezeInst>(&I);
if (!FI)
- return;
+ return false;
FI->replaceAllUsesWith(FI->getOperand(0));
ToRemove.push_back(FI);
+ return true;
}
-static void fixI8UseChain(Instruction &I,
+static bool fixI8UseChain(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &ReplacedValues) {
@@ -74,19 +75,19 @@ static void fixI8UseChain(Instruction &I,
if (Trunc->getDestTy()->isIntegerTy(8)) {
ReplacedValues[Trunc] = Trunc->getOperand(0);
ToRemove.push_back(Trunc);
- return;
+ return true;
}
}
if (auto *Store = dyn_cast<StoreInst>(&I)) {
if (!Store->getValueOperand()->getType()->isIntegerTy(8))
- return;
+ return false;
SmallVector<Value *> NewOperands;
ProcessOperands(NewOperands);
Value *NewStore = Builder.CreateStore(NewOperands[0], NewOperands[1]);
ReplacedValues[Store] = NewStore;
ToRemove.push_back(Store);
- return;
+ return true;
}
if (auto *Load = dyn_cast<LoadInst>(&I);
@@ -104,17 +105,17 @@ static void fixI8UseChain(Instruction &I,
LoadInst *NewLoad = Builder.CreateLoad(ElementType, NewOperands[0]);
ReplacedValues[Load] = NewLoad;
ToRemove.push_back(Load);
- return;
+ return true;
}
if (auto *Load = dyn_cast<LoadInst>(&I);
Load && isa<ConstantExpr>(Load->getPointerOperand())) {
auto *CE = dyn_cast<ConstantExpr>(Load->getPointerOperand());
if (!(CE->getOpcode() == Instruction::GetElementPtr))
- return;
+ return false;
auto *GEP = dyn_cast<GEPOperator>(CE);
if (!GEP->getSourceElementType()->isIntegerTy(8))
- return;
+ return false;
Type *ElementType = Load->getType();
ConstantInt *Offset = dyn_cast<ConstantInt>(GEP->getOperand(1));
@@ -143,12 +144,12 @@ static void fixI8UseChain(Instruction &I,
ReplacedValues[Load] = NewLoad;
Load->replaceAllUsesWith(NewLoad);
ToRemove.push_back(Load);
- return;
+ return true;
}
if (auto *BO = dyn_cast<BinaryOperator>(&I)) {
if (!I.getType()->isIntegerTy(8))
- return;
+ return false;
SmallVector<Value *> NewOperands;
ProcessOperands(NewOperands);
Value *NewInst =
@@ -162,24 +163,24 @@ static void fixI8UseChain(Instruction &I,
}
ReplacedValues[BO] = NewInst;
ToRemove.push_back(BO);
- return;
+ return true;
}
if (auto *Sel = dyn_cast<SelectInst>(&I)) {
if (!I.getType()->isIntegerTy(8))
- return;
+ return false;
SmallVector<Value *> NewOperands;
ProcessOperands(NewOperands);
Value *NewInst = Builder.CreateSelect(Sel->getCondition(), NewOperands[1],
NewOperands[2]);
ReplacedValues[Sel] = NewInst;
ToRemove.push_back(Sel);
- return;
+ return true;
}
if (auto *Cmp = dyn_cast<CmpInst>(&I)) {
if (!Cmp->getOperand(0)->getType()->isIntegerTy(8))
- return;
+ return false;
SmallVector<Value *> NewOperands;
ProcessOperands(NewOperands);
Value *NewInst =
@@ -187,18 +188,18 @@ static void fixI8UseChain(Instruction &I,
Cmp->replaceAllUsesWith(NewInst);
ReplacedValues[Cmp] = NewInst;
ToRemove.push_back(Cmp);
- return;
+ return true;
}
if (auto *Cast = dyn_cast<CastInst>(&I)) {
if (!Cast->getSrcTy()->isIntegerTy(8))
- return;
+ return false;
ToRemove.push_back(Cast);
auto *Replacement = ReplacedValues[Cast->getOperand(0)];
if (Cast->getType() == Replacement->getType()) {
Cast->replaceAllUsesWith(Replacement);
- return;
+ return true;
}
Value *AdjustedCast = nullptr;
@@ -213,7 +214,7 @@ static void fixI8UseChain(Instruction &I,
if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
if (!GEP->getType()->isPointerTy() ||
!GEP->getSourceElementType()->isIntegerTy(8))
- return;
+ return false;
Value *BasePtr = GEP->getPointerOperand();
if (ReplacedValues.count(BasePtr))
@@ -248,15 +249,17 @@ static void fixI8UseChain(Instruction &I,
ReplacedValues[GEP] = NewGEP;
GEP->replaceAllUsesWith(NewGEP);
ToRemove.push_back(GEP);
+ return true;
}
+ return false;
}
-static void upcastI8AllocasAndUses(Instruction &I,
+static bool upcastI8AllocasAndUses(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &ReplacedValues) {
auto *AI = dyn_cast<AllocaInst>(&I);
if (!AI || !AI->getAllocatedType()->isIntegerTy(8))
- return;
+ return false;
Type *SmallestType = nullptr;
@@ -291,16 +294,17 @@ static void upcastI8AllocasAndUses(Instruction &I,
}
if (!SmallestType)
- return; // no valid casts found
+ return false; // no valid casts found
// Replace alloca
IRBuilder<> Builder(AI);
auto *NewAlloca = Builder.CreateAlloca(SmallestType);
ReplacedValues[AI] = NewAlloca;
ToRemove.push_back(AI);
+ return true;
}
-static void
+static bool
downcastI64toI32InsertExtractElements(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &) {
@@ -318,6 +322,7 @@ downcastI64toI32InsertExtractElements(Instruction &I,
Extract->replaceAllUsesWith(NewExtract);
ToRemove.push_back(Extract);
+ return true;
}
}
@@ -335,8 +340,10 @@ downcastI64toI32InsertExtractElements(Instruction &I,
Insert->replaceAllUsesWith(Insert32Index);
ToRemove.push_back(Insert);
+ return true;
}
}
+ return false;
}
static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src,
@@ -453,17 +460,17 @@ static void emitMemsetExpansion(IRBuilder<> &Builder, Value *Dst, Value *Val,
// Expands the instruction `I` into corresponding loads and stores if it is a
// memcpy call. In that case, the call instruction is added to the `ToRemove`
// vector. `ReplacedValues` is unused.
-static void legalizeMemCpy(Instruction &I,
+static bool legalizeMemCpy(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &ReplacedValues) {
CallInst *CI = dyn_cast<CallInst>(&I);
if (!CI)
- return;
+ return false;
Intrinsic::ID ID = CI->getIntrinsicID();
if (ID != Intrinsic::memcpy)
- return;
+ return false;
IRBuilder<> Builder(&I);
Value *Dst = CI->getArgOperand(0);
@@ -476,19 +483,20 @@ static void legalizeMemCpy(Instruction &I,
assert(IsVolatile->getZExtValue() == 0 && "Expected IsVolatile to be false");
emitMemcpyExpansion(Builder, Dst, Src, Length);
ToRemove.push_back(CI);
+ return true;
}
-static void legalizeMemSet(Instruction &I,
+static bool legalizeMemSet(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &ReplacedValues) {
CallInst *CI = dyn_cast<CallInst>(&I);
if (!CI)
- return;
+ return false;
Intrinsic::ID ID = CI->getIntrinsicID();
if (ID != Intrinsic::memset)
- return;
+ return false;
IRBuilder<> Builder(&I);
Value *Dst = CI->getArgOperand(0);
@@ -497,23 +505,25 @@ static void legalizeMemSet(Instruction &I,
assert(Size && "Expected Size to be a ConstantInt");
emitMemsetExpansion(Builder, Dst, Val, Size, ReplacedValues);
ToRemove.push_back(CI);
+ return true;
}
-static void updateFnegToFsub(Instruction &I,
+static bool updateFnegToFsub(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &) {
const Intrinsic::ID ID = I.getOpcode();
if (ID != Instruction::FNeg)
- return;
+ return false;
IRBuilder<> Builder(&I);
Value *In = I.getOperand(0);
Value *Zero = ConstantFP::get(In->getType(), -0.0);
I.replaceAllUsesWith(Builder.CreateFSub(Zero, In));
ToRemove.push_back(&I);
+ return true;
}
-static void
+static bool
legalizeGetHighLowi64Bytes(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &ReplacedValues) {
@@ -523,13 +533,13 @@ legalizeGetHighLowi64Bytes(Instruction &I,
BitCast->getSrcTy()->isIntegerTy(64)) {
ToRemove.push_back(BitCast);
ReplacedValues[BitCast] = BitCast->getOperand(0);
- return;
+ return true;
}
}
if (auto *Extract = dyn_cast<ExtractElementInst>(&I)) {
if (!dyn_cast<BitCastInst>(Extract->getVectorOperand()))
- return;
+ return false;
auto *VecTy = dyn_cast<FixedVectorType>(Extract->getVectorOperandType());
if (VecTy && VecTy->getElementType()->isIntegerTy(32) &&
VecTy->getNumElements() == 2) {
@@ -557,12 +567,14 @@ legalizeGetHighLowi64Bytes(Instruction &I,
}
ToRemove.push_back(Extract);
Extract->replaceAllUsesWith(ReplacedValues[Extract]);
+ return true;
}
}
}
+ return false;
}
-static void
+static bool
legalizeScalarLoadStoreOnArrays(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &) {
@@ -579,14 +591,14 @@ legalizeScalarLoadStoreOnArrays(Instruction &I,
PtrOpIndex = SI->getPointerOperandIndex();
LoadStoreTy = SI->getValueOperand()->getType();
} else
- return;
+ return false;
// If the load/store is not of a single-value type (i.e., scalar or vector)
// then we do not modify it. It shouldn't be a vector either because the
// dxil-data-scalarization pass is expected to run before this, but it's not
// incorrect to apply this transformation to vector load/stores.
if (!LoadStoreTy->isSingleValueType())
- return;
+ return false;
Type *ArrayTy;
if (auto *GlobalVarPtrOp = dyn_cast<GlobalVariable>(PtrOp))
@@ -594,10 +606,10 @@ legalizeScalarLoadStoreOnArrays(Instruction &I,
else if (auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp))
ArrayTy = AllocaPtrOp->getAllocatedType();
else
- return;
+ return false;
if (!isa<ArrayType>(ArrayTy))
- return;
+ return false;
assert(ArrayTy->getArrayElementType() == LoadStoreTy &&
"Expected array element type to be the same as to the scalar load or "
@@ -607,6 +619,7 @@ legalizeScalarLoadStoreOnArrays(Instruction &I,
Value *GEP = GetElementPtrInst::Create(
ArrayTy, PtrOp, {Zero, Zero}, GEPNoWrapFlags::all(), "", I.getIterator());
I.setOperand(PtrOpIndex, GEP);
+ return true;
}
namespace {
@@ -624,13 +637,11 @@ public:
ReplacedValues.clear();
for (auto &I : instructions(F)) {
for (auto &LegalizationFn : LegalizationPipeline[Stage])
- LegalizationFn(I, ToRemove, ReplacedValues);
+ MadeChange |= LegalizationFn(I, ToRemove, ReplacedValues);
}
for (auto *Inst : reverse(ToRemove))
Inst->eraseFromParent();
-
- MadeChange |= !ToRemove.empty();
}
return MadeChange;
}
@@ -639,7 +650,7 @@ private:
enum LegalizationStage { Stage1 = 0, Stage2 = 1, NumStages };
using LegalizationFnTy =
- std::function<void(Instruction &, SmallVectorImpl<Instruction *> &,
+ std::function<bool(Instruction &, SmallVectorImpl<Instruction *> &,
DenseMap<Value *, Value *> &)>;
SmallVector<LegalizationFnTy> LegalizationPipeline[NumStages];
diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
index 566f3a9..c33ec0e 100644
--- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
+++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
@@ -241,7 +241,6 @@ static void replaceAccess(IntrinsicInst *II, dxil::ResourceTypeInfo &RTI) {
}
static bool transformResourcePointers(Function &F, DXILResourceTypeMap &DRTM) {
- bool Changed = false;
SmallVector<std::pair<IntrinsicInst *, dxil::ResourceTypeInfo>> Resources;
for (BasicBlock &BB : F)
for (Instruction &I : BB)
@@ -254,7 +253,7 @@ static bool transformResourcePointers(Function &F, DXILResourceTypeMap &DRTM) {
for (auto &[II, RI] : Resources)
replaceAccess(II, RI);
- return Changed;
+ return !Resources.empty();
}
PreservedAnalyses DXILResourceAccess::run(Function &F,
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
index dfc8162..ebdfcaa 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/DXILMetadataAnalysis.h"
#include "llvm/BinaryFormat/DXContainer.h"
+#include "llvm/Frontend/HLSL/RootSignatureMetadata.h"
#include "llvm/Frontend/HLSL/RootSignatureValidations.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
@@ -29,25 +30,10 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
-#include <optional>
-#include <utility>
using namespace llvm;
using namespace llvm::dxil;
-static bool reportError(LLVMContext *Ctx, Twine Message,
- DiagnosticSeverity Severity = DS_Error) {
- Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity));
- return true;
-}
-
-static bool reportValueError(LLVMContext *Ctx, Twine ParamName,
- uint32_t Value) {
- Ctx->diagnose(DiagnosticInfoGeneric(
- "Invalid value for " + ParamName + ": " + Twine(Value), DS_Error));
- return true;
-}
-
static std::optional<uint32_t> extractMdIntValue(MDNode *Node,
unsigned int OpId) {
if (auto *CI =
@@ -56,453 +42,10 @@ static std::optional<uint32_t> extractMdIntValue(MDNode *Node,
return std::nullopt;
}
-static std::optional<float> extractMdFloatValue(MDNode *Node,
- unsigned int OpId) {
- if (auto *CI = mdconst::dyn_extract<ConstantFP>(Node->getOperand(OpId).get()))
- return CI->getValueAPF().convertToFloat();
- return std::nullopt;
-}
-
-static std::optional<StringRef> extractMdStringValue(MDNode *Node,
- unsigned int OpId) {
- MDString *NodeText = dyn_cast<MDString>(Node->getOperand(OpId));
- if (NodeText == nullptr)
- return std::nullopt;
- return NodeText->getString();
-}
-
-static bool parseRootFlags(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
- MDNode *RootFlagNode) {
-
- if (RootFlagNode->getNumOperands() != 2)
- return reportError(Ctx, "Invalid format for RootFlag Element");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootFlagNode, 1))
- RSD.Flags = *Val;
- else
- return reportError(Ctx, "Invalid value for RootFlag");
-
- return false;
-}
-
-static bool parseRootConstants(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
- MDNode *RootConstantNode) {
-
- if (RootConstantNode->getNumOperands() != 5)
- return reportError(Ctx, "Invalid format for RootConstants Element");
-
- dxbc::RTS0::v1::RootParameterHeader Header;
- // The parameter offset doesn't matter here - we recalculate it during
- // serialization Header.ParameterOffset = 0;
- Header.ParameterType =
- llvm::to_underlying(dxbc::RootParameterType::Constants32Bit);
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 1))
- Header.ShaderVisibility = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderVisibility");
-
- dxbc::RTS0::v1::RootConstants Constants;
- if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 2))
- Constants.ShaderRegister = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderRegister");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 3))
- Constants.RegisterSpace = *Val;
- else
- return reportError(Ctx, "Invalid value for RegisterSpace");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 4))
- Constants.Num32BitValues = *Val;
- else
- return reportError(Ctx, "Invalid value for Num32BitValues");
-
- RSD.ParametersContainer.addParameter(Header, Constants);
-
- return false;
-}
-
-static bool parseRootDescriptors(LLVMContext *Ctx,
- mcdxbc::RootSignatureDesc &RSD,
- MDNode *RootDescriptorNode,
- RootSignatureElementKind ElementKind) {
- assert(ElementKind == RootSignatureElementKind::SRV ||
- ElementKind == RootSignatureElementKind::UAV ||
- ElementKind == RootSignatureElementKind::CBV &&
- "parseRootDescriptors should only be called with RootDescriptor "
- "element kind.");
- if (RootDescriptorNode->getNumOperands() != 5)
- return reportError(Ctx, "Invalid format for Root Descriptor Element");
-
- dxbc::RTS0::v1::RootParameterHeader Header;
- switch (ElementKind) {
- case RootSignatureElementKind::SRV:
- Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::SRV);
- break;
- case RootSignatureElementKind::UAV:
- Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::UAV);
- break;
- case RootSignatureElementKind::CBV:
- Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::CBV);
- break;
- default:
- llvm_unreachable("invalid Root Descriptor kind");
- break;
- }
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 1))
- Header.ShaderVisibility = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderVisibility");
-
- dxbc::RTS0::v2::RootDescriptor Descriptor;
- if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 2))
- Descriptor.ShaderRegister = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderRegister");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 3))
- Descriptor.RegisterSpace = *Val;
- else
- return reportError(Ctx, "Invalid value for RegisterSpace");
-
- if (RSD.Version == 1) {
- RSD.ParametersContainer.addParameter(Header, Descriptor);
- return false;
- }
- assert(RSD.Version > 1);
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 4))
- Descriptor.Flags = *Val;
- else
- return reportError(Ctx, "Invalid value for Root Descriptor Flags");
-
- RSD.ParametersContainer.addParameter(Header, Descriptor);
- return false;
-}
-
-static bool parseDescriptorRange(LLVMContext *Ctx,
- mcdxbc::DescriptorTable &Table,
- MDNode *RangeDescriptorNode) {
-
- if (RangeDescriptorNode->getNumOperands() != 6)
- return reportError(Ctx, "Invalid format for Descriptor Range");
-
- dxbc::RTS0::v2::DescriptorRange Range;
-
- std::optional<StringRef> ElementText =
- extractMdStringValue(RangeDescriptorNode, 0);
-
- if (!ElementText.has_value())
- return reportError(Ctx, "Descriptor Range, first element is not a string.");
-
- Range.RangeType =
- StringSwitch<uint32_t>(*ElementText)
- .Case("CBV", llvm::to_underlying(dxbc::DescriptorRangeType::CBV))
- .Case("SRV", llvm::to_underlying(dxbc::DescriptorRangeType::SRV))
- .Case("UAV", llvm::to_underlying(dxbc::DescriptorRangeType::UAV))
- .Case("Sampler",
- llvm::to_underlying(dxbc::DescriptorRangeType::Sampler))
- .Default(~0U);
-
- if (Range.RangeType == ~0U)
- return reportError(Ctx, "Invalid Descriptor Range type: " + *ElementText);
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 1))
- Range.NumDescriptors = *Val;
- else
- return reportError(Ctx, "Invalid value for Number of Descriptor in Range");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 2))
- Range.BaseShaderRegister = *Val;
- else
- return reportError(Ctx, "Invalid value for BaseShaderRegister");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 3))
- Range.RegisterSpace = *Val;
- else
- return reportError(Ctx, "Invalid value for RegisterSpace");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 4))
- Range.OffsetInDescriptorsFromTableStart = *Val;
- else
- return reportError(Ctx,
- "Invalid value for OffsetInDescriptorsFromTableStart");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 5))
- Range.Flags = *Val;
- else
- return reportError(Ctx, "Invalid value for Descriptor Range Flags");
-
- Table.Ranges.push_back(Range);
- return false;
-}
-
-static bool parseDescriptorTable(LLVMContext *Ctx,
- mcdxbc::RootSignatureDesc &RSD,
- MDNode *DescriptorTableNode) {
- const unsigned int NumOperands = DescriptorTableNode->getNumOperands();
- if (NumOperands < 2)
- return reportError(Ctx, "Invalid format for Descriptor Table");
-
- dxbc::RTS0::v1::RootParameterHeader Header;
- if (std::optional<uint32_t> Val = extractMdIntValue(DescriptorTableNode, 1))
- Header.ShaderVisibility = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderVisibility");
-
- mcdxbc::DescriptorTable Table;
- Header.ParameterType =
- llvm::to_underlying(dxbc::RootParameterType::DescriptorTable);
-
- for (unsigned int I = 2; I < NumOperands; I++) {
- MDNode *Element = dyn_cast<MDNode>(DescriptorTableNode->getOperand(I));
- if (Element == nullptr)
- return reportError(Ctx, "Missing Root Element Metadata Node.");
-
- if (parseDescriptorRange(Ctx, Table, Element))
- return true;
- }
-
- RSD.ParametersContainer.addParameter(Header, Table);
- return false;
-}
-
-static bool parseStaticSampler(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
- MDNode *StaticSamplerNode) {
- if (StaticSamplerNode->getNumOperands() != 14)
- return reportError(Ctx, "Invalid format for Static Sampler");
-
- dxbc::RTS0::v1::StaticSampler Sampler;
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 1))
- Sampler.Filter = *Val;
- else
- return reportError(Ctx, "Invalid value for Filter");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 2))
- Sampler.AddressU = *Val;
- else
- return reportError(Ctx, "Invalid value for AddressU");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 3))
- Sampler.AddressV = *Val;
- else
- return reportError(Ctx, "Invalid value for AddressV");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 4))
- Sampler.AddressW = *Val;
- else
- return reportError(Ctx, "Invalid value for AddressW");
-
- if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 5))
- Sampler.MipLODBias = *Val;
- else
- return reportError(Ctx, "Invalid value for MipLODBias");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 6))
- Sampler.MaxAnisotropy = *Val;
- else
- return reportError(Ctx, "Invalid value for MaxAnisotropy");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 7))
- Sampler.ComparisonFunc = *Val;
- else
- return reportError(Ctx, "Invalid value for ComparisonFunc ");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 8))
- Sampler.BorderColor = *Val;
- else
- return reportError(Ctx, "Invalid value for ComparisonFunc ");
-
- if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 9))
- Sampler.MinLOD = *Val;
- else
- return reportError(Ctx, "Invalid value for MinLOD");
-
- if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 10))
- Sampler.MaxLOD = *Val;
- else
- return reportError(Ctx, "Invalid value for MaxLOD");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 11))
- Sampler.ShaderRegister = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderRegister");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 12))
- Sampler.RegisterSpace = *Val;
- else
- return reportError(Ctx, "Invalid value for RegisterSpace");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 13))
- Sampler.ShaderVisibility = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderVisibility");
-
- RSD.StaticSamplers.push_back(Sampler);
- return false;
-}
-
-static bool parseRootSignatureElement(LLVMContext *Ctx,
- mcdxbc::RootSignatureDesc &RSD,
- MDNode *Element) {
- std::optional<StringRef> ElementText = extractMdStringValue(Element, 0);
- if (!ElementText.has_value())
- return reportError(Ctx, "Invalid format for Root Element");
-
- RootSignatureElementKind ElementKind =
- StringSwitch<RootSignatureElementKind>(*ElementText)
- .Case("RootFlags", RootSignatureElementKind::RootFlags)
- .Case("RootConstants", RootSignatureElementKind::RootConstants)
- .Case("RootCBV", RootSignatureElementKind::CBV)
- .Case("RootSRV", RootSignatureElementKind::SRV)
- .Case("RootUAV", RootSignatureElementKind::UAV)
- .Case("DescriptorTable", RootSignatureElementKind::DescriptorTable)
- .Case("StaticSampler", RootSignatureElementKind::StaticSamplers)
- .Default(RootSignatureElementKind::Error);
-
- switch (ElementKind) {
-
- case RootSignatureElementKind::RootFlags:
- return parseRootFlags(Ctx, RSD, Element);
- case RootSignatureElementKind::RootConstants:
- return parseRootConstants(Ctx, RSD, Element);
- case RootSignatureElementKind::CBV:
- case RootSignatureElementKind::SRV:
- case RootSignatureElementKind::UAV:
- return parseRootDescriptors(Ctx, RSD, Element, ElementKind);
- case RootSignatureElementKind::DescriptorTable:
- return parseDescriptorTable(Ctx, RSD, Element);
- case RootSignatureElementKind::StaticSamplers:
- return parseStaticSampler(Ctx, RSD, Element);
- case RootSignatureElementKind::Error:
- return reportError(Ctx, "Invalid Root Signature Element: " + *ElementText);
- }
-
- llvm_unreachable("Unhandled RootSignatureElementKind enum.");
-}
-
-static bool parse(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
- MDNode *Node) {
- bool HasError = false;
-
- // Loop through the Root Elements of the root signature.
- for (const auto &Operand : Node->operands()) {
- MDNode *Element = dyn_cast<MDNode>(Operand);
- if (Element == nullptr)
- return reportError(Ctx, "Missing Root Element Metadata Node.");
-
- HasError = HasError || parseRootSignatureElement(Ctx, RSD, Element);
- }
-
- return HasError;
-}
-
-static bool validate(LLVMContext *Ctx, const mcdxbc::RootSignatureDesc &RSD) {
-
- if (!llvm::hlsl::rootsig::verifyVersion(RSD.Version)) {
- return reportValueError(Ctx, "Version", RSD.Version);
- }
-
- if (!llvm::hlsl::rootsig::verifyRootFlag(RSD.Flags)) {
- return reportValueError(Ctx, "RootFlags", RSD.Flags);
- }
-
- for (const mcdxbc::RootParameterInfo &Info : RSD.ParametersContainer) {
- if (!dxbc::isValidShaderVisibility(Info.Header.ShaderVisibility))
- return reportValueError(Ctx, "ShaderVisibility",
- Info.Header.ShaderVisibility);
-
- assert(dxbc::isValidParameterType(Info.Header.ParameterType) &&
- "Invalid value for ParameterType");
-
- switch (Info.Header.ParameterType) {
-
- case llvm::to_underlying(dxbc::RootParameterType::CBV):
- case llvm::to_underlying(dxbc::RootParameterType::UAV):
- case llvm::to_underlying(dxbc::RootParameterType::SRV): {
- const dxbc::RTS0::v2::RootDescriptor &Descriptor =
- RSD.ParametersContainer.getRootDescriptor(Info.Location);
- if (!llvm::hlsl::rootsig::verifyRegisterValue(Descriptor.ShaderRegister))
- return reportValueError(Ctx, "ShaderRegister",
- Descriptor.ShaderRegister);
-
- if (!llvm::hlsl::rootsig::verifyRegisterSpace(Descriptor.RegisterSpace))
- return reportValueError(Ctx, "RegisterSpace", Descriptor.RegisterSpace);
-
- if (RSD.Version > 1) {
- if (!llvm::hlsl::rootsig::verifyRootDescriptorFlag(RSD.Version,
- Descriptor.Flags))
- return reportValueError(Ctx, "RootDescriptorFlag", Descriptor.Flags);
- }
- break;
- }
- case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): {
- const mcdxbc::DescriptorTable &Table =
- RSD.ParametersContainer.getDescriptorTable(Info.Location);
- for (const dxbc::RTS0::v2::DescriptorRange &Range : Table) {
- if (!llvm::hlsl::rootsig::verifyRangeType(Range.RangeType))
- return reportValueError(Ctx, "RangeType", Range.RangeType);
-
- if (!llvm::hlsl::rootsig::verifyRegisterSpace(Range.RegisterSpace))
- return reportValueError(Ctx, "RegisterSpace", Range.RegisterSpace);
-
- if (!llvm::hlsl::rootsig::verifyNumDescriptors(Range.NumDescriptors))
- return reportValueError(Ctx, "NumDescriptors", Range.NumDescriptors);
-
- if (!llvm::hlsl::rootsig::verifyDescriptorRangeFlag(
- RSD.Version, Range.RangeType, Range.Flags))
- return reportValueError(Ctx, "DescriptorFlag", Range.Flags);
- }
- break;
- }
- }
- }
-
- for (const dxbc::RTS0::v1::StaticSampler &Sampler : RSD.StaticSamplers) {
- if (!llvm::hlsl::rootsig::verifySamplerFilter(Sampler.Filter))
- return reportValueError(Ctx, "Filter", Sampler.Filter);
-
- if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressU))
- return reportValueError(Ctx, "AddressU", Sampler.AddressU);
-
- if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressV))
- return reportValueError(Ctx, "AddressV", Sampler.AddressV);
-
- if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressW))
- return reportValueError(Ctx, "AddressW", Sampler.AddressW);
-
- if (!llvm::hlsl::rootsig::verifyMipLODBias(Sampler.MipLODBias))
- return reportValueError(Ctx, "MipLODBias", Sampler.MipLODBias);
-
- if (!llvm::hlsl::rootsig::verifyMaxAnisotropy(Sampler.MaxAnisotropy))
- return reportValueError(Ctx, "MaxAnisotropy", Sampler.MaxAnisotropy);
-
- if (!llvm::hlsl::rootsig::verifyComparisonFunc(Sampler.ComparisonFunc))
- return reportValueError(Ctx, "ComparisonFunc", Sampler.ComparisonFunc);
-
- if (!llvm::hlsl::rootsig::verifyBorderColor(Sampler.BorderColor))
- return reportValueError(Ctx, "BorderColor", Sampler.BorderColor);
-
- if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MinLOD))
- return reportValueError(Ctx, "MinLOD", Sampler.MinLOD);
-
- if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MaxLOD))
- return reportValueError(Ctx, "MaxLOD", Sampler.MaxLOD);
-
- if (!llvm::hlsl::rootsig::verifyRegisterValue(Sampler.ShaderRegister))
- return reportValueError(Ctx, "ShaderRegister", Sampler.ShaderRegister);
-
- if (!llvm::hlsl::rootsig::verifyRegisterSpace(Sampler.RegisterSpace))
- return reportValueError(Ctx, "RegisterSpace", Sampler.RegisterSpace);
-
- if (!dxbc::isValidShaderVisibility(Sampler.ShaderVisibility))
- return reportValueError(Ctx, "ShaderVisibility",
- Sampler.ShaderVisibility);
- }
-
- return false;
+static bool reportError(LLVMContext *Ctx, Twine Message,
+ DiagnosticSeverity Severity = DS_Error) {
+ Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity));
+ return true;
}
static SmallDenseMap<const Function *, mcdxbc::RootSignatureDesc>
@@ -584,7 +127,9 @@ analyzeModule(Module &M) {
// static sampler offset is calculated when writting dxcontainer.
RSD.StaticSamplersOffset = 0u;
- if (parse(Ctx, RSD, RootElementListNode) || validate(Ctx, RSD)) {
+ hlsl::rootsig::MetadataParser MDParser(RootElementListNode);
+
+ if (MDParser.ParseRootSignature(Ctx, RSD)) {
return RSDMap;
}
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h
index fc39b38..254b7ff 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.h
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.h
@@ -26,17 +26,6 @@
namespace llvm {
namespace dxil {
-enum class RootSignatureElementKind {
- Error = 0,
- RootFlags = 1,
- RootConstants = 2,
- SRV = 3,
- UAV = 4,
- CBV = 5,
- DescriptorTable = 6,
- StaticSamplers = 7
-};
-
class RootSignatureBindingInfo {
private:
SmallDenseMap<const Function *, mcdxbc::RootSignatureDesc> FuncToRsMap;
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
index eb4adfe..e7e7f2c 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
@@ -106,11 +106,11 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF,
DXILResourceTypeMap &DRTM,
const ModuleMetadataInfo &MMDI) {
if (!CSF.Doubles)
- CSF.Doubles = I.getType()->isDoubleTy();
+ CSF.Doubles = I.getType()->getScalarType()->isDoubleTy();
if (!CSF.Doubles) {
for (const Value *Op : I.operands()) {
- if (Op->getType()->isDoubleTy()) {
+ if (Op->getType()->getScalarType()->isDoubleTy()) {
CSF.Doubles = true;
break;
}
@@ -130,12 +130,13 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF,
}
if (!CSF.LowPrecisionPresent)
- CSF.LowPrecisionPresent =
- I.getType()->isIntegerTy(16) || I.getType()->isHalfTy();
+ CSF.LowPrecisionPresent = I.getType()->getScalarType()->isIntegerTy(16) ||
+ I.getType()->getScalarType()->isHalfTy();
if (!CSF.LowPrecisionPresent) {
for (const Value *Op : I.operands()) {
- if (Op->getType()->isIntegerTy(16) || Op->getType()->isHalfTy()) {
+ if (Op->getType()->getScalarType()->isIntegerTy(16) ||
+ Op->getType()->getScalarType()->isHalfTy()) {
CSF.LowPrecisionPresent = true;
break;
}
@@ -150,11 +151,11 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF,
}
if (!CSF.Int64Ops)
- CSF.Int64Ops = I.getType()->isIntegerTy(64);
+ CSF.Int64Ops = I.getType()->getScalarType()->isIntegerTy(64);
if (!CSF.Int64Ops && !isa<LifetimeIntrinsic>(&I)) {
for (const Value *Op : I.operands()) {
- if (Op->getType()->isIntegerTy(64)) {
+ if (Op->getType()->getScalarType()->isIntegerTy(64)) {
CSF.Int64Ops = true;
break;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index c2eb24b..c34eecd 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -38,7 +38,6 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsHexagon.h"
#include "llvm/IR/Module.h"
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index e915a3c4..613cfb5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2385,19 +2385,9 @@ SDValue LoongArchTargetLowering::lowerBF16_TO_FP(SDValue Op,
return Res;
}
-static bool isConstantOrUndef(const SDValue Op) {
- if (Op->isUndef())
- return true;
- if (isa<ConstantSDNode>(Op))
- return true;
- if (isa<ConstantFPSDNode>(Op))
- return true;
- return false;
-}
-
-static bool isConstantOrUndefBUILD_VECTOR(const BuildVectorSDNode *Op) {
+static bool isConstantBUILD_VECTOR(const BuildVectorSDNode *Op) {
for (unsigned i = 0; i < Op->getNumOperands(); ++i)
- if (isConstantOrUndef(Op->getOperand(i)))
+ if (isIntOrFPConstant(Op->getOperand(i)))
return true;
return false;
}
@@ -2505,20 +2495,23 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
if (DAG.isSplatValue(Op, /*AllowUndefs=*/false))
return Op;
- if (!isConstantOrUndefBUILD_VECTOR(Node)) {
+ if (!isConstantBUILD_VECTOR(Node)) {
// Use INSERT_VECTOR_ELT operations rather than expand to stores.
// The resulting code is the same length as the expansion, but it doesn't
// use memory operations.
- EVT ResTy = Node->getValueType(0);
-
assert(ResTy.isVector());
unsigned NumElts = ResTy.getVectorNumElements();
- SDValue Vector =
- DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Node->getOperand(0));
+ SDValue Op0 = Node->getOperand(0);
+ SDValue Vector = DAG.getUNDEF(ResTy);
+
+ if (!Op0.isUndef())
+ Vector = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Op0);
for (unsigned i = 1; i < NumElts; ++i) {
- Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector,
- Node->getOperand(i),
+ SDValue Opi = Node->getOperand(i);
+ if (Opi.isUndef())
+ continue;
+ Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector, Opi,
DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
}
return Vector;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index feb4eb3..d9680c7 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -969,7 +969,7 @@ void MipsTargetELFStreamer::finish() {
Align Alignment = Section.getAlign();
S.switchSection(&Section);
- if (Section.useCodeAlign())
+ if (getContext().getAsmInfo()->useCodeAlign(Section))
S.emitCodeAlignment(Alignment, &STI, Alignment.value());
else
S.emitValueToAlignment(Alignment, 0, 1, Alignment.value());
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index 614b321..ce9cd12 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -15,8 +15,6 @@
using namespace llvm;
-void NVPTXMCAsmInfo::anchor() {}
-
NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple,
const MCTargetOptions &Options) {
if (TheTriple.getArch() == Triple::nvptx64) {
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
index 77c4dae..f071406 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -19,8 +19,6 @@ namespace llvm {
class Triple;
class NVPTXMCAsmInfo : public MCAsmInfo {
- virtual void anchor();
-
public:
explicit NVPTXMCAsmInfo(const Triple &TheTriple,
const MCTargetOptions &Options);
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
index 9f91143..329e3b5 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
@@ -97,10 +97,7 @@ void NVPTXTargetStreamer::changeSection(const MCSection *CurSection,
if (isDwarfSection(FI, Section)) {
// Emit DWARF .file directives in the outermost scope.
outputDwarfFileDirectives();
- OS << "\t.section";
- Section->printSwitchToSection(*getStreamer().getContext().getAsmInfo(),
- getStreamer().getContext().getTargetTriple(),
- OS, SubSection);
+ OS << "\t.section\t" << Section->getName() << '\n';
// DWARF sections are enclosed into braces - emit the open one.
OS << "\t{\n";
HasSections = true;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index f2c2f46..ddcecc00 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -952,10 +952,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// promoted to f32. v2f16 is expanded to f16, which is then promoted
// to f32.
for (const auto &Op :
- {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
+ {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) {
setOperationAction(Op, MVT::f16, Promote);
setOperationAction(Op, MVT::f32, Legal);
- setOperationAction(Op, MVT::f64, Legal);
+ // only div/rem/sqrt are legal for f64
+ if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
+ setOperationAction(Op, MVT::f64, Legal);
+ }
setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
setOperationAction(Op, MVT::bf16, Promote);
AddPromotedToType(Op, MVT::bf16, MVT::f32);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index b5df4c6..442b900 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1234,7 +1234,7 @@ defm FMA_F32 : FMA<F32RT, allow_ftz = true>;
defm FMA_F32x2 : FMA<F32X2RT, allow_ftz = true, preds = [hasF32x2Instructions]>;
defm FMA_F64 : FMA<F64RT, allow_ftz = false>;
-// sin/cos
+// sin/cos/tanh
class UnaryOpAllowsApproxFn<SDPatternOperator operator>
: PatFrag<(ops node:$A),
@@ -1250,6 +1250,10 @@ def COS_APPROX_f32 :
BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$src), (ins FTZFlag:$ftz),
"cos.approx$ftz.f32",
[(set f32:$dst, (UnaryOpAllowsApproxFn<fcos> f32:$src))]>;
+def TANH_APPROX_f32 :
+ BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), "tanh.approx.f32",
+ [(set f32:$dst, (UnaryOpAllowsApproxFn<ftanh> f32:$src))]>,
+ Requires<[hasPTX<70>, hasSM<75>]>;
//-----------------------------------
// Bitwise operations
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 8baf866..1af2f9c 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -220,8 +220,6 @@ bool PPCELFMCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr,
return evaluateAsRelocatable(Expr, Res, Asm);
}
-void PPCXCOFFMCAsmInfo::anchor() {}
-
PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) {
if (T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppcle)
report_fatal_error("XCOFF is not supported for little-endian targets");
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index 0f945b3..6af1bd7 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -33,8 +33,6 @@ public:
};
class PPCXCOFFMCAsmInfo : public MCAsmInfoXCOFF {
- void anchor() override;
-
public:
explicit PPCXCOFFMCAsmInfo(bool is64Bit, const Triple &);
void printSpecifierExpr(raw_ostream &OS,
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 54497d9..3dad0e8 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -213,7 +213,7 @@ public:
void emitTCEntry(const MCSymbol &S, PPCMCExpr::Specifier Kind) override {
if (const MCSymbolXCOFF *XSym = dyn_cast<MCSymbolXCOFF>(&S)) {
MCSymbolXCOFF *TCSym =
- cast<MCSectionXCOFF>(Streamer.getCurrentSectionOnly())
+ static_cast<const MCSectionXCOFF *>(Streamer.getCurrentSectionOnly())
->getQualNameSymbol();
// On AIX, we have TLS variable offsets (symbol@({gd|ie|le|ld}) depending
// on the TLS access method (or model). For the general-dynamic access
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index a091b21..ce1d51a 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -2274,9 +2274,9 @@ void PPCAIXAsmPrinter::emitLinkage(const GlobalValue *GV,
void PPCAIXAsmPrinter::SetupMachineFunction(MachineFunction &MF) {
// Setup CurrentFnDescSym and its containing csect.
- MCSectionXCOFF *FnDescSec =
- cast<MCSectionXCOFF>(getObjFileLowering().getSectionForFunctionDescriptor(
- &MF.getFunction(), TM));
+ auto *FnDescSec = static_cast<MCSectionXCOFF *>(
+ getObjFileLowering().getSectionForFunctionDescriptor(&MF.getFunction(),
+ TM));
FnDescSec->setAlignment(Align(Subtarget->isPPC64() ? 8 : 4));
CurrentFnDescSym = FnDescSec->getQualNameSymbol();
@@ -2669,9 +2669,9 @@ void PPCAIXAsmPrinter::emitTracebackTable() {
MCSymbol *EHInfoSym =
TargetLoweringObjectFileXCOFF::getEHInfoTableSymbol(MF);
MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(EHInfoSym, TOCType_EHBlock);
- const MCSymbol *TOCBaseSym =
- cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection())
- ->getQualNameSymbol();
+ const MCSymbol *TOCBaseSym = static_cast<const MCSectionXCOFF *>(
+ getObjFileLowering().getTOCBaseSection())
+ ->getQualNameSymbol();
const MCExpr *Exp =
MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCEntry, Ctx),
MCSymbolRefExpr::create(TOCBaseSym, Ctx), Ctx);
@@ -2788,7 +2788,7 @@ void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) {
}
}
- MCSectionXCOFF *Csect = cast<MCSectionXCOFF>(
+ auto *Csect = static_cast<MCSectionXCOFF *>(
getObjFileLowering().SectionForGlobal(GV, GVKind, TM));
// Switch to the containing csect.
@@ -2869,9 +2869,9 @@ void PPCAIXAsmPrinter::emitFunctionDescriptor() {
OutStreamer->emitValue(MCSymbolRefExpr::create(CurrentFnSym, OutContext),
PointerSize);
// Emit TOC base address.
- const MCSymbol *TOCBaseSym =
- cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection())
- ->getQualNameSymbol();
+ const MCSymbol *TOCBaseSym = static_cast<const MCSectionXCOFF *>(
+ getObjFileLowering().getTOCBaseSection())
+ ->getQualNameSymbol();
OutStreamer->emitValue(MCSymbolRefExpr::create(TOCBaseSym, OutContext),
PointerSize);
// Emit a null environment pointer.
@@ -2996,10 +2996,10 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) {
Name += Prefix;
Name += cast<MCSymbolXCOFF>(I.first.first)->getSymbolTableName();
MCSymbol *S = OutContext.getOrCreateSymbol(Name);
- TCEntry = cast<MCSectionXCOFF>(
+ TCEntry = static_cast<MCSectionXCOFF *>(
getObjFileLowering().getSectionForTOCEntry(S, TM));
} else {
- TCEntry = cast<MCSectionXCOFF>(
+ TCEntry = static_cast<MCSectionXCOFF *>(
getObjFileLowering().getSectionForTOCEntry(I.first.first, TM));
}
OutStreamer->switchSection(TCEntry);
@@ -3054,7 +3054,7 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) {
return;
SectionKind GOKind = getObjFileLowering().getKindForGlobal(GO, TM);
- MCSectionXCOFF *Csect = cast<MCSectionXCOFF>(
+ auto *Csect = static_cast<MCSectionXCOFF *>(
getObjFileLowering().SectionForGlobal(GO, GOKind, TM));
Align GOAlign = getGVAlignment(GO, GO->getDataLayout());
@@ -3316,9 +3316,9 @@ void PPCAIXAsmPrinter::emitTTypeReference(const GlobalValue *GV,
GlobalType = TOCType_GlobalExternal;
MCSymbol *TypeInfoSym = TM.getSymbol(GV);
MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(TypeInfoSym, GlobalType);
- const MCSymbol *TOCBaseSym =
- cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection())
- ->getQualNameSymbol();
+ const MCSymbol *TOCBaseSym = static_cast<const MCSectionXCOFF *>(
+ getObjFileLowering().getTOCBaseSection())
+ ->getQualNameSymbol();
auto &Ctx = OutStreamer->getContext();
const MCExpr *Exp =
MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCEntry, Ctx),
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index a143d85..d71c42c 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -3849,9 +3849,14 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
switch (Inst.getOpcode()) {
default:
break;
- case RISCV::PseudoC_ADDI_NOP:
- emitToStreamer(Out, MCInstBuilder(RISCV::C_NOP));
+ case RISCV::PseudoC_ADDI_NOP: {
+ if (Inst.getOperand(2).getImm() == 0)
+ emitToStreamer(Out, MCInstBuilder(RISCV::C_NOP));
+ else
+ emitToStreamer(
+ Out, MCInstBuilder(RISCV::C_NOP_HINT).addOperand(Inst.getOperand(2)));
return false;
+ }
case RISCV::PseudoLLAImm:
case RISCV::PseudoLAImm:
case RISCV::PseudoLI: {
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index fa7bcfa..5e54b82 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -193,21 +193,19 @@ static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, uint32_t RegNo,
static DecodeStatus DecodeGPRNoX0RegisterClass(MCInst &Inst, uint32_t RegNo,
uint64_t Address,
const MCDisassembler *Decoder) {
- if (RegNo == 0) {
+ if (RegNo == 0)
return MCDisassembler::Fail;
- }
return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
}
-static DecodeStatus
-DecodeGPRNoX0X2RegisterClass(MCInst &Inst, uint64_t RegNo, uint32_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo == 2) {
+static DecodeStatus DecodeGPRNoX2RegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint32_t Address,
+ const MCDisassembler *Decoder) {
+ if (RegNo == 2)
return MCDisassembler::Fail;
- }
- return DecodeGPRNoX0RegisterClass(Inst, RegNo, Address, Decoder);
+ return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
}
static DecodeStatus DecodeGPRNoX31RegisterClass(MCInst &Inst, uint32_t RegNo,
@@ -536,31 +534,6 @@ static DecodeStatus decodeRTZArg(MCInst &Inst, uint32_t Imm, int64_t Address,
return MCDisassembler::Success;
}
-static DecodeStatus decodeRVCInstrRdRs1ImmZero(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeRVCInstrRdSImm6(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeRVCInstrRdCLUIImm(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus
-decodeRVCInstrRdRs1UImmLog2XLenNonZero(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeRVCInstrRdRs1Rs2(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
uint64_t Address,
const MCDisassembler *Decoder);
@@ -579,18 +552,6 @@ static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
#include "RISCVGenDisassemblerTables.inc"
-static DecodeStatus decodeRVCInstrRdRs1ImmZero(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- DecodeStatus S = MCDisassembler::Success;
- uint32_t Rd = fieldFromInstruction(Insn, 7, 5);
- if (!Check(S, DecodeGPRNoX0RegisterClass(Inst, Rd, Address, Decoder)))
- return MCDisassembler::Fail;
- Inst.addOperand(Inst.getOperand(0));
- Inst.addOperand(MCOperand::createImm(0));
- return S;
-}
-
static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -601,66 +562,6 @@ static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
return MCDisassembler::Success;
}
-static DecodeStatus decodeRVCInstrRdSImm6(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- Inst.addOperand(MCOperand::createReg(RISCV::X0));
- uint32_t Imm =
- fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5);
- [[maybe_unused]] DecodeStatus Result =
- decodeSImmOperand<6>(Inst, Imm, Address, Decoder);
- assert(Result == MCDisassembler::Success && "Invalid immediate");
- return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeRVCInstrRdCLUIImm(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- Inst.addOperand(MCOperand::createReg(RISCV::X0));
- uint32_t Imm =
- fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5);
- return decodeCLUIImmOperand(Inst, Imm, Address, Decoder);
-}
-
-static DecodeStatus
-decodeRVCInstrRdRs1UImmLog2XLenNonZero(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- Inst.addOperand(MCOperand::createReg(RISCV::X0));
- Inst.addOperand(Inst.getOperand(0));
-
- uint32_t UImm6 =
- fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5);
- return decodeUImmLog2XLenNonZeroOperand(Inst, UImm6, Address, Decoder);
-}
-
-static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- DecodeStatus S = MCDisassembler::Success;
- uint32_t Rd = fieldFromInstruction(Insn, 7, 5);
- uint32_t Rs2 = fieldFromInstruction(Insn, 2, 5);
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rd, Address, Decoder)))
- return MCDisassembler::Fail;
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder)))
- return MCDisassembler::Fail;
- return S;
-}
-
-static DecodeStatus decodeRVCInstrRdRs1Rs2(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- DecodeStatus S = MCDisassembler::Success;
- uint32_t Rd = fieldFromInstruction(Insn, 7, 5);
- uint32_t Rs2 = fieldFromInstruction(Insn, 2, 5);
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rd, Address, Decoder)))
- return MCDisassembler::Fail;
- Inst.addOperand(Inst.getOperand(0));
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder)))
- return MCDisassembler::Fail;
- return S;
-}
-
static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index 7ad5d5f..bddea43 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -330,7 +330,6 @@ enum OperandType : unsigned {
OPERAND_UIMM32,
OPERAND_UIMM48,
OPERAND_UIMM64,
- OPERAND_ZERO,
OPERAND_THREE,
OPERAND_FOUR,
OPERAND_SIMM5,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index baa508a..269b117 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -13,13 +13,7 @@
#include "MCTargetDesc/RISCVAsmBackend.h"
#include "MCTargetDesc/RISCVMCAsmInfo.h"
-#include "RISCVFixupKinds.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index d4f5d8f..2f32e2a 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -293,7 +293,7 @@ void RISCVAsmPrinter::emitNTLHint(const MachineInstr *MI) {
MCInst Hint;
if (STI->hasStdExtZca())
- Hint.setOpcode(RISCV::C_ADD_HINT);
+ Hint.setOpcode(RISCV::C_ADD);
else
Hint.setOpcode(RISCV::ADD);
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index f9c0b54..171940e 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1272,7 +1272,7 @@ def FeatureVendorXSfmm128t
def FeatureVendorXSfvqmaccdod
: RISCVExtension<1, 0,
"SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2)",
- [FeatureStdExtZve32x]>;
+ [FeatureStdExtZve32x, FeatureStdExtZvl128b]>;
def HasVendorXSfvqmaccdod
: Predicate<"Subtarget->hasVendorXSfvqmaccdod()">,
AssemblerPredicate<(all_of FeatureVendorXSfvqmaccdod),
@@ -1281,7 +1281,7 @@ def HasVendorXSfvqmaccdod
def FeatureVendorXSfvqmaccqoq
: RISCVExtension<1, 0,
"SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4)",
- [FeatureStdExtZve32x]>;
+ [FeatureStdExtZve32x, FeatureStdExtZvl256b]>;
def HasVendorXSfvqmaccqoq
: Predicate<"Subtarget->hasVendorXSfvqmaccqoq()">,
AssemblerPredicate<(all_of FeatureVendorXSfvqmaccqoq),
@@ -1290,7 +1290,7 @@ def HasVendorXSfvqmaccqoq
def FeatureVendorXSfvfwmaccqqq
: RISCVExtension<1, 0,
"SiFive Matrix Multiply Accumulate Instruction (4-by-4)",
- [FeatureStdExtZvfbfmin]>;
+ [FeatureStdExtZvfbfmin, FeatureStdExtZvl128b]>;
def HasVendorXSfvfwmaccqqq
: Predicate<"Subtarget->hasVendorXSfvfwmaccqqq()">,
AssemblerPredicate<(all_of FeatureVendorXSfvfwmaccqqq),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 3918dd2..54845e5 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1618,6 +1618,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
}
+ // Customize load and store operation for bf16 if zfh isn't enabled.
+ if (Subtarget.hasVendorXAndesBFHCvt() && !Subtarget.hasStdExtZfh()) {
+ setOperationAction(ISD::LOAD, MVT::bf16, Custom);
+ setOperationAction(ISD::STORE, MVT::bf16, Custom);
+ }
+
// Function alignments.
const Align FunctionAlignment(Subtarget.hasStdExtZca() ? 2 : 4);
setMinFunctionAlignment(FunctionAlignment);
@@ -7216,6 +7222,47 @@ static SDValue SplitStrictFPVectorOp(SDValue Op, SelectionDAG &DAG) {
return DAG.getMergeValues({V, HiRes.getValue(1)}, DL);
}
+SDValue
+RISCVTargetLowering::lowerXAndesBfHCvtBFloat16Load(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget.hasVendorXAndesBFHCvt() && !Subtarget.hasStdExtZfh() &&
+ "Unexpected bfloat16 load lowering");
+
+ SDLoc DL(Op);
+ LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
+ EVT MemVT = LD->getMemoryVT();
+ SDValue Load = DAG.getExtLoad(
+ ISD::ZEXTLOAD, DL, Subtarget.getXLenVT(), LD->getChain(),
+ LD->getBasePtr(),
+ EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
+ LD->getMemOperand());
+ // Using mask to make bf16 nan-boxing valid when we don't have flh
+ // instruction. -65536 would be treat as a small number and thus it can be
+ // directly used lui to get the constant.
+ SDValue mask = DAG.getSignedConstant(-65536, DL, Subtarget.getXLenVT());
+ SDValue OrSixteenOne =
+ DAG.getNode(ISD::OR, DL, Load.getValueType(), {Load, mask});
+ SDValue ConvertedResult =
+ DAG.getNode(RISCVISD::NDS_FMV_BF16_X, DL, MVT::bf16, OrSixteenOne);
+ return DAG.getMergeValues({ConvertedResult, Load.getValue(1)}, DL);
+}
+
+SDValue
+RISCVTargetLowering::lowerXAndesBfHCvtBFloat16Store(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget.hasVendorXAndesBFHCvt() && !Subtarget.hasStdExtZfh() &&
+ "Unexpected bfloat16 store lowering");
+
+ StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
+ SDLoc DL(Op);
+ SDValue FMV = DAG.getNode(RISCVISD::NDS_FMV_X_ANYEXTBF16, DL,
+ Subtarget.getXLenVT(), ST->getValue());
+ return DAG.getTruncStore(
+ ST->getChain(), DL, FMV, ST->getBasePtr(),
+ EVT::getIntegerVT(*DAG.getContext(), ST->getMemoryVT().getSizeInBits()),
+ ST->getMemOperand());
+}
+
SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -7914,6 +7961,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return DAG.getMergeValues({Pair, Chain}, DL);
}
+ if (VT == MVT::bf16)
+ return lowerXAndesBfHCvtBFloat16Load(Op, DAG);
+
// Handle normal vector tuple load.
if (VT.isRISCVVectorTuple()) {
SDLoc DL(Op);
@@ -7998,6 +8048,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
{Store->getChain(), Lo, Hi, Store->getBasePtr()}, MVT::i64,
Store->getMemOperand());
}
+
+ if (VT == MVT::bf16)
+ return lowerXAndesBfHCvtBFloat16Store(Op, DAG);
+
// Handle normal vector tuple store.
if (VT.isRISCVVectorTuple()) {
SDLoc DL(Op);
@@ -16079,7 +16133,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
uint64_t MulAmt = CNode->getZExtValue();
// Don't do this if the Xqciac extension is enabled and the MulAmt in simm12.
- if (Subtarget.hasVendorXqciac() && isInt<12>(MulAmt))
+ if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue()))
return SDValue();
const bool HasShlAdd = Subtarget.hasStdExtZba() ||
@@ -16184,10 +16238,12 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
// 2^N - 3/5/9 --> (sub (shl X, C1), (shXadd X, x))
for (uint64_t Offset : {3, 5, 9}) {
if (isPowerOf2_64(MulAmt + Offset)) {
+ unsigned ShAmt = Log2_64(MulAmt + Offset);
+ if (ShAmt >= VT.getSizeInBits())
+ continue;
SDLoc DL(N);
SDValue Shift1 =
- DAG.getNode(ISD::SHL, DL, VT, X,
- DAG.getConstant(Log2_64(MulAmt + Offset), DL, VT));
+ DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShAmt, DL, VT));
SDValue Mul359 =
DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getConstant(Log2_64(Offset - 1), DL, VT), X);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index f0447e0..ca70c46 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -578,6 +578,9 @@ private:
SDValue lowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerPARTIAL_REDUCE_MLA(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerXAndesBfHCvtBFloat16Load(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerXAndesBfHCvtBFloat16Store(SDValue Op, SelectionDAG &DAG) const;
+
bool isEligibleForTailCallOptimization(
CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
const SmallVector<CCValAssign, 16> &ArgLocs) const;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 64f9e3e..085064e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2859,9 +2859,6 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
case RISCVOp::OPERAND_UIMM16_NONZERO:
Ok = isUInt<16>(Imm) && (Imm != 0);
break;
- case RISCVOp::OPERAND_ZERO:
- Ok = Imm == 0;
- break;
case RISCVOp::OPERAND_THREE:
Ok = Imm == 3;
break;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
index 8252a9b..c5551fb 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -57,12 +57,6 @@ def simm6nonzero : RISCVOp,
}];
}
-def immzero : RISCVOp,
- ImmLeaf<XLenVT, [{return (Imm == 0);}]> {
- let ParserMatchClass = ImmZeroAsmOperand;
- let OperandType = "OPERAND_ZERO";
-}
-
def CLUIImmAsmOperand : AsmOperandClass {
let Name = "CLUIImm";
let RenderMethod = "addImmOperands";
@@ -272,7 +266,7 @@ class Bcz<bits<3> funct3, string OpcodeStr>
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class Shift_right<bits<2> funct2, string OpcodeStr>
: RVInst16CB<0b100, 0b01, (outs GPRC:$rd),
- (ins GPRC:$rs1, uimmlog2xlennonzero:$imm),
+ (ins GPRC:$rs1, uimmlog2xlen:$imm),
OpcodeStr, "$rs1, $imm"> {
let Constraints = "$rs1 = $rd";
let Inst{12} = imm{5};
@@ -402,17 +396,19 @@ def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", "">,
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb),
- (ins GPRNoX0:$rd, simm6nonzero:$imm),
+ (ins GPRNoX0:$rd, simm6:$imm),
"c.addi", "$rd, $imm">,
Sched<[WriteIALU, ReadIALU]> {
let Constraints = "$rd = $rd_wb";
}
-// Alternate syntax for c.nop. Converted to C_NOP by the assembler.
+// Alternate syntax for c.nop. Converted to C_NOP/C_NOP_HINT by the assembler.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0,
isAsmParserOnly = 1 in
-def PseudoC_ADDI_NOP : Pseudo<(outs GPRX0:$rd), (ins GPRX0:$rs1, immzero:$imm),
- [], "c.addi", "$rd, $imm">;
+def PseudoC_ADDI_NOP : Pseudo<(outs GPRX0:$rd), (ins GPRX0:$rs1, simm6:$imm),
+ [], "c.addi", "$rd, $imm"> {
+ let Constraints = "$rs1 = $rd";
+}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCall = 1,
DecoderNamespace = "RV32Only", Defs = [X1],
@@ -430,7 +426,7 @@ def C_ADDIW : RVInst16CI<0b001, 0b01, (outs GPRNoX0:$rd_wb),
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def C_LI : RVInst16CI<0b010, 0b01, (outs GPRNoX0:$rd), (ins simm6:$imm),
+def C_LI : RVInst16CI<0b010, 0b01, (outs GPR:$rd), (ins simm6:$imm),
"c.li", "$rd, $imm">,
Sched<[WriteIALU]>;
@@ -449,7 +445,7 @@ def C_ADDI16SP : RVInst16CI<0b011, 0b01, (outs SP:$rd_wb),
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def C_LUI : RVInst16CI<0b011, 0b01, (outs GPRNoX0X2:$rd),
+def C_LUI : RVInst16CI<0b011, 0b01, (outs GPRNoX2:$rd),
(ins c_lui_imm:$imm),
"c.lui", "$rd, $imm">,
Sched<[WriteIALU]>;
@@ -497,8 +493,8 @@ def C_BEQZ : Bcz<0b110, "c.beqz">, Sched<[WriteJmp, ReadJmp]>;
def C_BNEZ : Bcz<0b111, "c.bnez">, Sched<[WriteJmp, ReadJmp]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd_wb),
- (ins GPRNoX0:$rd, uimmlog2xlennonzero:$imm),
+def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb),
+ (ins GPR:$rd, uimmlog2xlen:$imm),
"c.slli", "$rd, $imm">,
Sched<[WriteShiftImm, ReadShiftImm]> {
let Constraints = "$rd = $rd_wb";
@@ -544,7 +540,7 @@ def C_JR : RVInst16CR<0b1000, 0b10, (outs), (ins GPRNoX0:$rs1),
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isMoveReg = 1,
isAsCheapAsAMove = 1 in
-def C_MV : RVInst16CR<0b1000, 0b10, (outs GPRNoX0:$rs1), (ins GPRNoX0:$rs2),
+def C_MV : RVInst16CR<0b1000, 0b10, (outs GPR:$rs1), (ins GPRNoX0:$rs2),
"c.mv", "$rs1, $rs2">,
Sched<[WriteIALU, ReadIALU]>;
@@ -557,8 +553,8 @@ def C_JALR : RVInst16CR<0b1001, 0b10, (outs), (ins GPRNoX0:$rs1),
"c.jalr", "$rs1">, Sched<[WriteJalr, ReadJalr]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPRNoX0:$rd),
- (ins GPRNoX0:$rs1, GPRNoX0:$rs2),
+def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPR:$rd),
+ (ins GPR:$rs1, GPRNoX0:$rs2),
"c.add", "$rs1, $rs2">,
Sched<[WriteIALU, ReadIALU, ReadIALU]> {
let Constraints = "$rs1 = $rd";
@@ -616,81 +612,6 @@ def C_NOP_HINT : RVInst16CI<0b000, 0b01, (outs), (ins simm6nonzero:$imm),
let rd = 0;
}
-def C_ADDI_HINT_IMM_ZERO : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb),
- (ins GPRNoX0:$rd, immzero:$imm),
- "c.addi", "$rd, $imm">,
- Sched<[WriteIALU, ReadIALU]> {
- let Constraints = "$rd = $rd_wb";
- let imm = 0;
- let DecoderMethod = "decodeRVCInstrRdRs1ImmZero";
-}
-
-def C_LI_HINT : RVInst16CI<0b010, 0b01, (outs GPRX0:$rd), (ins simm6:$imm),
- "c.li", "$rd, $imm">,
- Sched<[WriteIALU]> {
- let Inst{11-7} = 0;
- let DecoderMethod = "decodeRVCInstrRdSImm6";
-}
-
-def C_LUI_HINT : RVInst16CI<0b011, 0b01, (outs GPRX0:$rd),
- (ins c_lui_imm:$imm),
- "c.lui", "$rd, $imm">,
- Sched<[WriteIALU]> {
- let Inst{11-7} = 0;
- let DecoderMethod = "decodeRVCInstrRdCLUIImm";
-}
-
-def C_MV_HINT : RVInst16CR<0b1000, 0b10, (outs GPRX0:$rs1), (ins GPRNoX0:$rs2),
- "c.mv", "$rs1, $rs2">, Sched<[WriteIALU, ReadIALU]> {
- let Inst{11-7} = 0;
- let DecoderMethod = "decodeRVCInstrRdRs2";
-}
-
-def C_ADD_HINT : RVInst16CR<0b1001, 0b10, (outs GPRX0:$rd),
- (ins GPRX0:$rs1, GPRNoX0:$rs2),
- "c.add", "$rs1, $rs2">,
- Sched<[WriteIALU, ReadIALU, ReadIALU]> {
- let Constraints = "$rs1 = $rd";
- let Inst{11-7} = 0;
- let DecoderMethod = "decodeRVCInstrRdRs1Rs2";
-}
-
-def C_SLLI_HINT : RVInst16CI<0b000, 0b10, (outs GPRX0:$rd_wb),
- (ins GPRX0:$rd, uimmlog2xlennonzero:$imm),
- "c.slli", "$rd, $imm">,
- Sched<[WriteShiftImm, ReadShiftImm]> {
- let Constraints = "$rd = $rd_wb";
- let Inst{11-7} = 0;
- let DecoderMethod = "decodeRVCInstrRdRs1UImmLog2XLenNonZero";
-}
-
-def C_SLLI64_HINT : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), (ins GPR:$rd),
- "c.slli64", "$rd">,
- Sched<[WriteShiftImm, ReadShiftImm]> {
- let Constraints = "$rd = $rd_wb";
- let imm = 0;
-}
-
-def C_SRLI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd),
- (ins GPRC:$rs1),
- "c.srli64", "$rs1">,
- Sched<[WriteShiftImm, ReadShiftImm]> {
- let Constraints = "$rs1 = $rd";
- let Inst{6-2} = 0;
- let Inst{11-10} = 0b00;
- let Inst{12} = 0;
-}
-
-def C_SRAI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd),
- (ins GPRC:$rs1),
- "c.srai64", "$rs1">,
- Sched<[WriteShiftImm, ReadShiftImm]> {
- let Constraints = "$rs1 = $rd";
- let Inst{6-2} = 0;
- let Inst{11-10} = 0b01;
- let Inst{12} = 0;
-}
-
} // Predicates = [HasStdExtZca], hasSideEffects = 0, mayLoad = 0,
// mayStore = 0
@@ -699,15 +620,17 @@ def C_SRAI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd),
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtZca] in {
-// Just a different syntax for the c.nop hint: c.addi x0, simm6 vs c.nop simm6.
-def : InstAlias<"c.addi x0, $imm", (C_NOP_HINT simm6nonzero:$imm), 0>;
+// Legacy aliases.
+def : InstAlias<"c.slli64 $rd", (C_SLLI GPR:$rd, 0), 0>;
+def : InstAlias<"c.srli64 $rs1", (C_SRLI GPRC:$rs1, 0), 0>;
+def : InstAlias<"c.srai64 $rs1", (C_SRAI GPRC:$rs1, 0), 0>;
}
let Predicates = [HasStdExtC, HasStdExtZihintntl] in {
-def : InstAlias<"c.ntl.p1", (C_ADD_HINT X0, X2)>;
-def : InstAlias<"c.ntl.pall", (C_ADD_HINT X0, X3)>;
-def : InstAlias<"c.ntl.s1", (C_ADD_HINT X0, X4)>;
-def : InstAlias<"c.ntl.all", (C_ADD_HINT X0, X5)>;
+def : InstAlias<"c.ntl.p1", (C_ADD X0, X2)>;
+def : InstAlias<"c.ntl.pall", (C_ADD X0, X3)>;
+def : InstAlias<"c.ntl.s1", (C_ADD X0, X4)>;
+def : InstAlias<"c.ntl.all", (C_ADD X0, X5)>;
} // Predicates = [HasStdExtC, HasStdExtZihintntl]
let EmitPriority = 0 in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index 7f1077a..dd365cf 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -89,39 +89,41 @@ class PLI_B_i<bits<8> funct8, string opcodestr>
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVPUnary<bits<3> f, string opcodestr, dag operands, string argstr>
- : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), operands, opcodestr, argstr> {
+class RVPShift_ri<bits<3> f, bits<3> funct3, string opcodestr, Operand ImmType>
+ : RVInstIBase<funct3, OPC_OP_IMM_32, (outs GPR:$rd),
+ (ins GPR:$rs1, ImmType:$shamt), opcodestr,
+ "$rd, $rs1, $shamt"> {
let Inst{31} = 0b1;
let Inst{30-28} = f;
let Inst{27} = 0b0;
}
-class RVPUnaryImm5<bits<3> f, string opcodestr>
- : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm5:$uimm5), "$rd, $rs1, $uimm5"> {
- bits<5> uimm5;
+class RVPShiftW_ri<bits<3> f, bits<3> funct3, string opcodestr>
+ : RVPShift_ri<f, funct3, opcodestr, uimm5> {
+ bits<5> shamt;
let Inst{26-25} = 0b01;
- let Inst{24-20} = uimm5;
+ let Inst{24-20} = shamt;
}
-class RVPUnaryImm4<bits<3> f, string opcodestr>
- : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm4:$uimm4), "$rd, $rs1, $uimm4"> {
- bits<4> uimm4;
+class RVPShiftH_ri<bits<3> f, bits<3> funct3, string opcodestr>
+ : RVPShift_ri<f, funct3, opcodestr, uimm4> {
+ bits<4> shamt;
let Inst{26-24} = 0b001;
- let Inst{23-20} = uimm4;
+ let Inst{23-20} = shamt;
}
-class RVPUnaryImm3<bits<3> f, string opcodestr>
- : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm3:$uimm3), "$rd, $rs1, $uimm3"> {
- bits<3> uimm3;
+class RVPShiftB_ri<bits<3> f, bits<3> funct3, string opcodestr>
+ : RVPShift_ri<f, funct3, opcodestr, uimm3> {
+ bits<3> shamt;
let Inst{26-23} = 0b0001;
- let Inst{22-20} = uimm3;
+ let Inst{22-20} = shamt;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVPUnaryWUF<bits<2> w, bits<5> uf, string opcodestr>
+class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr>
: RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins GPR:$rs1),
opcodestr, "$rd, $rs1"> {
let Inst{31-27} = 0b11100;
@@ -149,16 +151,16 @@ def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in {
-def PSLLI_B : RVPUnaryImm3<0b000, "pslli.b">;
-def PSLLI_H : RVPUnaryImm4<0b000, "pslli.h">;
-def PSSLAI_H : RVPUnaryImm4<0b101, "psslai.h">;
+def PSLLI_B : RVPShiftB_ri<0b000, 0b010, "pslli.b">;
+def PSLLI_H : RVPShiftH_ri<0b000, 0b010, "pslli.h">;
+def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">;
} // Predicates = [HasStdExtP]
let DecoderNamespace = "RV32Only",
Predicates = [HasStdExtP, IsRV32] in
-def SSLAI : RVPUnaryImm5<0b101, "sslai">;
+def SSLAI : RVPShiftW_ri<0b101, 0b010, "sslai">;
let Predicates = [HasStdExtP, IsRV64] in {
-def PSLLI_W : RVPUnaryImm5<0b000, "pslli.w">;
-def PSSLAI_W : RVPUnaryImm5<0b101, "psslai.w">;
+def PSLLI_W : RVPShiftW_ri<0b000, 0b010, "pslli.w">;
+def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in
@@ -169,13 +171,13 @@ let Predicates = [HasStdExtP] in
def PLI_B : PLI_B_i<0b10110100, "pli.b">;
let Predicates = [HasStdExtP] in {
-def PSEXT_H_B : RVPUnaryWUF<0b00, 0b00100, "psext.h.b">;
-def PSABS_H : RVPUnaryWUF<0b00, 0b00111, "psabs.h">;
-def PSABS_B : RVPUnaryWUF<0b10, 0b00111, "psabs.b">;
+def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">;
+def PSABS_H : RVPUnary_ri<0b00, 0b00111, "psabs.h">;
+def PSABS_B : RVPUnary_ri<0b10, 0b00111, "psabs.b">;
} // Predicates = [HasStdExtP]
let Predicates = [HasStdExtP, IsRV64] in {
-def PSEXT_W_B : RVPUnaryWUF<0b01, 0b00100, "psext.w.b">;
-def PSEXT_W_H : RVPUnaryWUF<0b01, 0b00101, "psext.w.h">;
+def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">;
+def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index dfa532a..6afc942d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -788,7 +788,7 @@ class VPseudoUSLoadNoMask<VReg RetClass,
DAGOperand sewop = sew> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl,
- sewop:$sew, vec_policy:$policy), []>,
+ sewop:$sew, vec_policy:$policy)>,
RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -804,7 +804,7 @@ class VPseudoUSLoadMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$vl, sew:$sew,
- vec_policy:$policy), []>,
+ vec_policy:$policy)>,
RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -821,7 +821,7 @@ class VPseudoUSLoadFFNoMask<VReg RetClass,
int EEW> :
RISCVVPseudo<(outs RetClass:$rd, GPR:$vl),
(ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl,
- sew:$sew, vec_policy:$policy), []>,
+ sew:$sew, vec_policy:$policy)>,
RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -837,7 +837,7 @@ class VPseudoUSLoadFFMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl),
(ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$avl, sew:$sew,
- vec_policy:$policy), []>,
+ vec_policy:$policy)>,
RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -854,7 +854,7 @@ class VPseudoSLoadNoMask<VReg RetClass,
int EEW> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$dest, GPRMemZeroOffset:$rs1, GPR:$rs2,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLE</*Masked*/0, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -870,7 +870,7 @@ class VPseudoSLoadMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMemZeroOffset:$rs1, GPR:$rs2, VMaskOp:$vm, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
+ sew:$sew, vec_policy:$policy)>,
RISCVVLE</*Masked*/1, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -892,7 +892,7 @@ class VPseudoILoadNoMask<VReg RetClass,
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$dest, GPRMemZeroOffset:$rs1, IdxClass:$rs2,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLX</*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
@@ -914,7 +914,7 @@ class VPseudoILoadMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMemZeroOffset:$rs1, IdxClass:$rs2, VMaskOp:$vm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLX</*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
@@ -933,7 +933,7 @@ class VPseudoUSStoreNoMask<VReg StClass,
DAGOperand sewop = sew> :
RISCVVPseudo<(outs),
(ins StClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl,
- sewop:$sew), []>,
+ sewop:$sew)>,
RISCVVSE</*Masked*/0, /*Strided*/0, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -946,7 +946,7 @@ class VPseudoUSStoreMask<VReg StClass,
int EEW> :
RISCVVPseudo<(outs),
(ins StClass:$rd, GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
+ VMaskOp:$vm, AVL:$vl, sew:$sew)>,
RISCVVSE</*Masked*/1, /*Strided*/0, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -960,7 +960,7 @@ class VPseudoSStoreNoMask<VReg StClass,
int EEW> :
RISCVVPseudo<(outs),
(ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2,
- AVL:$vl, sew:$sew), []>,
+ AVL:$vl, sew:$sew)>,
RISCVVSE</*Masked*/0, /*Strided*/1, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -973,7 +973,7 @@ class VPseudoSStoreMask<VReg StClass,
int EEW> :
RISCVVPseudo<(outs),
(ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
+ VMaskOp:$vm, AVL:$vl, sew:$sew)>,
RISCVVSE</*Masked*/1, /*Strided*/1, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -986,7 +986,7 @@ class VPseudoSStoreMask<VReg StClass,
class VPseudoNullaryNoMask<VReg RegClass> :
RISCVVPseudo<(outs RegClass:$rd),
(ins RegClass:$passthru,
- AVL:$vl, sew:$sew, vec_policy:$policy), []> {
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1015,7 +1015,7 @@ class VPseudoNullaryMask<VReg RegClass> :
// Nullary for pseudo instructions. They are expanded in
// RISCVExpandPseudoInsts pass.
class VPseudoNullaryPseudoM<string BaseInst> :
- RISCVVPseudo<(outs VR:$rd), (ins AVL:$vl, sew_mask:$sew), []> {
+ RISCVVPseudo<(outs VR:$rd), (ins AVL:$vl, sew_mask:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1031,7 +1031,7 @@ class VPseudoUnaryNoMask<DAGOperand RetClass,
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$passthru, OpClass:$rs2,
- AVL:$vl, sew:$sew, vec_policy:$policy), []> {
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1047,7 +1047,7 @@ class VPseudoUnaryNoMaskNoPolicy<DAGOperand RetClass,
string Constraint = "",
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
- (ins OpClass:$rs2, AVL:$vl, sew_mask:$sew), []> {
+ (ins OpClass:$rs2, AVL:$vl, sew_mask:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1063,7 +1063,7 @@ class VPseudoUnaryNoMaskRoundingMode<DAGOperand RetClass,
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$passthru, OpClass:$rs2, vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []> {
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1084,7 +1084,7 @@ class VPseudoUnaryMask<VReg RetClass,
DAGOperand sewop = sew> :
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
- VMaskOp:$vm, AVL:$vl, sewop:$sew, vec_policy:$policy), []> {
+ VMaskOp:$vm, AVL:$vl, sewop:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1104,7 +1104,7 @@ class VPseudoUnaryMaskRoundingMode<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
VMaskOp:$vm, vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []> {
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1139,7 +1139,7 @@ class VPseudoUnaryMask_NoExcept<VReg RetClass,
class VPseudoUnaryNoMaskGPROut :
RISCVVPseudo<(outs GPR:$rd),
- (ins VR:$rs2, AVL:$vl, sew_mask:$sew), []> {
+ (ins VR:$rs2, AVL:$vl, sew_mask:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1149,7 +1149,7 @@ class VPseudoUnaryNoMaskGPROut :
class VPseudoUnaryMaskGPROut :
RISCVVPseudo<(outs GPR:$rd),
- (ins VR:$rs1, VMaskOp:$vm, AVL:$vl, sew_mask:$sew), []> {
+ (ins VR:$rs1, VMaskOp:$vm, AVL:$vl, sew_mask:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1163,7 +1163,7 @@ class VPseudoUnaryAnyMask<VReg RetClass,
VReg Op1Class> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$passthru, Op1Class:$rs2,
- VR:$vm, AVL:$vl, sew:$sew), []> {
+ VR:$vm, AVL:$vl, sew:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1197,7 +1197,7 @@ class VPseudoBinaryNoMaskPolicy<VReg RetClass,
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1,
- AVL:$vl, sew:$sew, vec_policy:$policy), []> {
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1241,7 +1241,7 @@ class VPseudoBinaryMaskPolicyRoundingMode<VReg RetClass,
(ins GetVRegNoV0<RetClass>.R:$passthru,
Op1Class:$rs2, Op2Class:$rs1,
VMaskOp:$vm, vec_rm:$rm, AVL:$vl,
- sew:$sew, vec_policy:$policy), []> {
+ sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1266,7 +1266,7 @@ class VPseudoTiedBinaryNoMask<VReg RetClass,
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$rs2, Op2Class:$rs1, AVL:$vl, sew:$sew,
- vec_policy:$policy), []> {
+ vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1288,7 +1288,7 @@ class VPseudoTiedBinaryNoMaskRoundingMode<VReg RetClass,
(ins RetClass:$rs2, Op2Class:$rs1,
vec_rm:$rm,
AVL:$vl, sew:$sew,
- vec_policy:$policy), []> {
+ vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1380,7 +1380,7 @@ class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass,
Op1Class:$rs2, Op2Class:$rs1,
VMaskOp:$vm,
vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []> {
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1451,7 +1451,7 @@ class VPseudoTiedBinaryMaskRoundingMode<VReg RetClass,
Op2Class:$rs1,
VMaskOp:$vm,
vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []> {
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1480,7 +1480,7 @@ class VPseudoBinaryCarry<VReg RetClass,
(ins Op1Class:$rs2, Op2Class:$rs1,
VMV0:$carry, AVL:$vl, sew:$sew),
(ins Op1Class:$rs2, Op2Class:$rs1,
- AVL:$vl, sew:$sew)), []> {
+ AVL:$vl, sew:$sew))> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1498,7 +1498,7 @@ class VPseudoTiedBinaryCarryIn<VReg RetClass,
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1,
- VMV0:$carry, AVL:$vl, sew:$sew), []> {
+ VMV0:$carry, AVL:$vl, sew:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1516,7 +1516,7 @@ class VPseudoTernaryNoMask<VReg RetClass,
string Constraint> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
- AVL:$vl, sew:$sew), []> {
+ AVL:$vl, sew:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1532,7 +1532,7 @@ class VPseudoTernaryNoMaskWithPolicy<VReg RetClass,
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
- AVL:$vl, sew:$sew, vec_policy:$policy), []> {
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1570,7 +1570,7 @@ class VPseudoUSSegLoadNoMask<VReg RetClass,
bits<4> NF> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
+ sew:$sew, vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/0, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1587,7 +1587,7 @@ class VPseudoUSSegLoadMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$vl, sew:$sew,
- vec_policy:$policy), []>,
+ vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1605,7 +1605,7 @@ class VPseudoUSSegLoadFFNoMask<VReg RetClass,
bits<4> NF> :
RISCVVPseudo<(outs RetClass:$rd, GPR:$vl),
(ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl,
- sew:$sew, vec_policy:$policy), []>,
+ sew:$sew, vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/0, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1622,7 +1622,7 @@ class VPseudoUSSegLoadFFMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl),
(ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$avl, sew:$sew,
- vec_policy:$policy), []>,
+ vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1640,7 +1640,7 @@ class VPseudoSSegLoadNoMask<VReg RetClass,
bits<4> NF> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$passthru, GPRMemZeroOffset:$rs1, GPR:$offset,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/0, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1657,7 +1657,7 @@ class VPseudoSSegLoadMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMemZeroOffset:$rs1, GPR:$offset, VMaskOp:$vm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/1, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1679,7 +1679,7 @@ class VPseudoISegLoadNoMask<VReg RetClass,
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$passthru, GPRMemZeroOffset:$rs1,
IdxClass:$offset, AVL:$vl, sew:$sew,
- vec_policy:$policy), []>,
+ vec_policy:$policy)>,
RISCVVLXSEG<NF, /*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
@@ -1701,7 +1701,7 @@ class VPseudoISegLoadMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMemZeroOffset:$rs1, IdxClass:$offset, VMaskOp:$vm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLXSEG<NF, /*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
@@ -1735,7 +1735,7 @@ class VPseudoUSSegStoreMask<VReg ValClass,
bits<4> NF> :
RISCVVPseudo<(outs),
(ins ValClass:$rd, GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
+ VMaskOp:$vm, AVL:$vl, sew:$sew)>,
RISCVVSSEG<NF, /*Masked*/1, /*Strided*/0, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -1750,7 +1750,7 @@ class VPseudoSSegStoreNoMask<VReg ValClass,
bits<4> NF> :
RISCVVPseudo<(outs),
(ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR:$offset,
- AVL:$vl, sew:$sew), []>,
+ AVL:$vl, sew:$sew)>,
RISCVVSSEG<NF, /*Masked*/0, /*Strided*/1, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -1764,7 +1764,7 @@ class VPseudoSSegStoreMask<VReg ValClass,
bits<4> NF> :
RISCVVPseudo<(outs),
(ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR: $offset,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
+ VMaskOp:$vm, AVL:$vl, sew:$sew)>,
RISCVVSSEG<NF, /*Masked*/1, /*Strided*/1, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -1782,7 +1782,7 @@ class VPseudoISegStoreNoMask<VReg ValClass,
bit Ordered> :
RISCVVPseudo<(outs),
(ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index,
- AVL:$vl, sew:$sew), []>,
+ AVL:$vl, sew:$sew)>,
RISCVVSXSEG<NF, /*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
@@ -1799,7 +1799,7 @@ class VPseudoISegStoreMask<VReg ValClass,
bit Ordered> :
RISCVVPseudo<(outs),
(ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
+ VMaskOp:$vm, AVL:$vl, sew:$sew)>,
RISCVVSXSEG<NF, /*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
@@ -6703,7 +6703,7 @@ let Predicates = [HasVInstructions] in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
let HasSEWOp = 1, BaseInstr = VMV_X_S in
def PseudoVMV_X_S:
- RISCVVPseudo<(outs GPR:$rd), (ins VR:$rs2, sew:$sew), []>,
+ RISCVVPseudo<(outs GPR:$rd), (ins VR:$rs2, sew:$sew)>,
Sched<[WriteVMovXS, ReadVMovXS]>;
let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, isReMaterializable = 1,
Constraints = "$rd = $passthru" in
@@ -6723,8 +6723,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
foreach f = FPList in {
let HasSEWOp = 1, BaseInstr = VFMV_F_S in
def "PseudoVFMV_" # f.FX # "_S" :
- RISCVVPseudo<(outs f.fprclass:$rd),
- (ins VR:$rs2, sew:$sew), []>,
+ RISCVVPseudo<(outs f.fprclass:$rd), (ins VR:$rs2, sew:$sew)>,
Sched<[WriteVMovFS, ReadVMovFS]>;
let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, isReMaterializable = 1,
Constraints = "$rd = $passthru" in
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
index 1bb67f4..c75addd9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
@@ -11,6 +11,20 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
+// RISC-V specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDT_NDS_FMV_BF16_X
+ : SDTypeProfile<1, 1, [SDTCisVT<0, bf16>, SDTCisVT<1, XLenVT>]>;
+def SDT_NDS_FMV_X_ANYEXTBF16
+ : SDTypeProfile<1, 1, [SDTCisVT<0, XLenVT>, SDTCisVT<1, bf16>]>;
+
+def riscv_nds_fmv_bf16_x
+ : SDNode<"RISCVISD::NDS_FMV_BF16_X", SDT_NDS_FMV_BF16_X>;
+def riscv_nds_fmv_x_anyextbf16
+ : SDNode<"RISCVISD::NDS_FMV_X_ANYEXTBF16", SDT_NDS_FMV_X_ANYEXTBF16>;
+
+//===----------------------------------------------------------------------===//
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
@@ -773,6 +787,25 @@ def : Pat<(bf16 (fpround FPR32:$rs)),
(NDS_FCVT_BF16_S FPR32:$rs)>;
} // Predicates = [HasVendorXAndesBFHCvt]
+let isCodeGenOnly = 1 in {
+def NDS_FMV_BF16_X : FPUnaryOp_r<0b1111000, 0b00000, 0b000, FPR16, GPR, "fmv.w.x">,
+ Sched<[WriteFMovI32ToF32, ReadFMovI32ToF32]>;
+def NDS_FMV_X_BF16 : FPUnaryOp_r<0b1110000, 0b00000, 0b000, GPR, FPR16, "fmv.x.w">,
+ Sched<[WriteFMovF32ToI32, ReadFMovF32ToI32]>;
+}
+
+let Predicates = [HasVendorXAndesBFHCvt] in {
+def : Pat<(riscv_nds_fmv_bf16_x GPR:$src), (NDS_FMV_BF16_X GPR:$src)>;
+def : Pat<(riscv_nds_fmv_x_anyextbf16 (bf16 FPR16:$src)),
+ (NDS_FMV_X_BF16 (bf16 FPR16:$src))>;
+} // Predicates = [HasVendorXAndesBFHCvt]
+
+// Use flh/fsh to load/store bf16 if zfh is enabled.
+let Predicates = [HasStdExtZfh, HasVendorXAndesBFHCvt] in {
+def : LdPat<load, FLH, bf16>;
+def : StPat<store, FSH, FPR16, bf16>;
+} // Predicates = [HasStdExtZfh, HasVendorXAndesBFHCvt]
+
let Predicates = [HasVendorXAndesVBFHCvt] in {
defm PseudoNDS_VFWCVT_S_BF16 : VPseudoVWCVT_S_BF16;
defm PseudoNDS_VFNCVT_BF16_S : VPseudoVNCVT_BF16_S;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
index f173440..ed1a60a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
@@ -291,31 +291,31 @@ def : CompressPat<(MUL GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
let Predicates = [HasStdExtZcb, HasStdExtZbb] in{
def : CompressPat<(SEXT_B GPRC:$rs1, GPRC:$rs1),
- (C_SEXT_B GPRC:$rs1, GPRC:$rs1)>;
+ (C_SEXT_B GPRC:$rs1)>;
def : CompressPat<(SEXT_H GPRC:$rs1, GPRC:$rs1),
- (C_SEXT_H GPRC:$rs1, GPRC:$rs1)>;
+ (C_SEXT_H GPRC:$rs1)>;
} // Predicates = [HasStdExtZcb, HasStdExtZbb]
let Predicates = [HasStdExtZcb, HasStdExtZbb] in{
def : CompressPat<(ZEXT_H_RV32 GPRC:$rs1, GPRC:$rs1),
- (C_ZEXT_H GPRC:$rs1, GPRC:$rs1)>;
+ (C_ZEXT_H GPRC:$rs1)>;
def : CompressPat<(ZEXT_H_RV64 GPRC:$rs1, GPRC:$rs1),
- (C_ZEXT_H GPRC:$rs1, GPRC:$rs1)>;
+ (C_ZEXT_H GPRC:$rs1)>;
} // Predicates = [HasStdExtZcb, HasStdExtZbb]
let Predicates = [HasStdExtZcb] in{
def : CompressPat<(ANDI GPRC:$rs1, GPRC:$rs1, 255),
- (C_ZEXT_B GPRC:$rs1, GPRC:$rs1)>;
+ (C_ZEXT_B GPRC:$rs1)>;
} // Predicates = [HasStdExtZcb]
let Predicates = [HasStdExtZcb, HasStdExtZba, IsRV64] in{
def : CompressPat<(ADD_UW GPRC:$rs1, GPRC:$rs1, X0),
- (C_ZEXT_W GPRC:$rs1, GPRC:$rs1)>;
+ (C_ZEXT_W GPRC:$rs1)>;
} // Predicates = [HasStdExtZcb, HasStdExtZba, IsRV64]
let Predicates = [HasStdExtZcb] in{
def : CompressPat<(XORI GPRC:$rs1, GPRC:$rs1, -1),
- (C_NOT GPRC:$rs1, GPRC:$rs1)>;
+ (C_NOT GPRC:$rs1)>;
}
let Predicates = [HasStdExtZcb] in{
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 0565fcd..30d8f85 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -224,10 +224,10 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
- // Note: Same VL as above, but i32 not xlen due to signature of
- // vp.strided.load
- VL = Builder.CreateElementCount(Builder.getInt32Ty(),
- VTy->getElementCount());
+ // For rv64, need to truncate i64 to i32 to match signature. As VL is at most
+ // the number of active lanes (which is bounded by i32) this is safe.
+ VL = Builder.CreateTrunc(VL, Builder.getInt32Ty());
+
CallInst *CI =
Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
{VTy, BasePtr->getType(), Stride->getType()},
@@ -302,10 +302,9 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
- // Note: Same VL as above, but i32 not xlen due to signature of
- // vp.strided.store
- VL = Builder.CreateElementCount(Builder.getInt32Ty(),
- VTy->getElementCount());
+ // For rv64, need to truncate i64 to i32 to match signature. As VL is at
+ // most the number of active lanes (which is bounded by i32) this is safe.
+ VL = Builder.CreateTrunc(VL, Builder.getInt32Ty());
CallInst *CI =
Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index e87f452..ccb39e8 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -268,6 +268,11 @@ def GPRNoX0 : GPRRegisterClass<(sub GPR, X0)> {
let DiagnosticString = "register must be a GPR excluding zero (x0)";
}
+def GPRNoX2 : GPRRegisterClass<(sub GPR, X2)> {
+ let DiagnosticType = "InvalidRegClassGPRNoX2";
+ let DiagnosticString = "register must be a GPR excluding sp (x2)";
+}
+
def GPRNoX0X2 : GPRRegisterClass<(sub GPR, X0, X2)> {
let DiagnosticType = "InvalidRegClassGPRNoX0X2";
let DiagnosticString = "register must be a GPR excluding zero (x0) and sp (x2)";
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
index 3e286a7..bf23812 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
@@ -24,6 +24,67 @@ class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0
bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
}
+defvar SMX60VLEN = 256;
+defvar SMX60DLEN = !div(SMX60VLEN, 2);
+
+class Get1248Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 2,
+ !eq(mx, "M4") : 4,
+ !eq(mx, "M8") : 8,
+ true: 1
+ );
+}
+
+// Used for: logical opsz, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides
+class Get4816Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M4") : 8,
+ !eq(mx, "M8") : 16,
+ true: 4
+ );
+}
+
+// Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max
+class Get458Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M4") : 5,
+ !eq(mx, "M8") : 8,
+ true: 4
+ );
+}
+
+// Widening scaling pattern (4,4,4,4,5,8,8): plateaus at higher LMULs
+// Used for: widening operations
+class Get4588Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 5,
+ !eq(mx, "M4") : 8,
+ !eq(mx, "M8") : 8, // M8 not supported for most widening, fallback
+ true: 4
+ );
+}
+
+// Used for: mask-producing comparisons, carry ops with mask, FP comparisons
+class Get461018Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 6,
+ !eq(mx, "M4") : 10,
+ !eq(mx, "M8") : 18,
+ true: 4
+ );
+}
+
+// Used for: e64 multiply pattern, complex ops
+class Get781632Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 8,
+ !eq(mx, "M4") : 16,
+ !eq(mx, "M8") : 32,
+ true: 7
+ );
+}
+
def SpacemitX60Model : SchedMachineModel {
let IssueWidth = 2; // dual-issue
let MicroOpBufferSize = 0; // in-order
@@ -322,58 +383,96 @@ foreach LMul = [1, 2, 4, 8] in {
foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
- defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
-
- defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
-
- defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+ let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in {
+ // Pattern of vadd, vsub, vrsub: 4/4/5/8
+ // Pattern of vand, vor, vxor: 4/4/8/16
+ // They are grouped together, so we used the worst case 4/4/8/16
+ // TODO: use InstRW to override individual instructions' scheduling data
+ defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5,5,5,8,
+ // e64 = 7,8,16,32. We use the worst-case until we can split the SEW.
+ // TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites
+ let Latency = Get781632Latency<mx>.c, ReleaseAtCycles = [7] in {
+ defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
}
// Widening
+// Pattern of vwmul, vwmacc, etc: e8/e16 = 4/4/5/8, e32 = 5,5,5,8
+// We use the worst-case for all.
foreach mx = SchedMxListW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
- defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+ let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
}
-// Vector Integer Division and Remainder
+// Division and remainder operations
+// Pattern of vdivu: 11/11/11/20/40/80/160
+// Pattern of vdiv: 12/12/12/22/44/88/176
+// Pattern of vremu: 12/12/12/22/44/88/176
+// Pattern of vrem: 13/13/13/24/48/96/192
+// We use for all: 12/12/12/24/48/96/192
+// TODO: Create separate WriteVIRem to more closely match the latencies
foreach mx = SchedMxList in {
foreach sew = SchedSEWSet<mx>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ // Slightly reduced for fractional LMULs
+ defvar Multiplier = !cond(
+ !eq(mx, "MF8") : 12,
+ !eq(mx, "MF4") : 12,
+ !eq(mx, "MF2") : 12,
+ true: 24
+ );
+
+ let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ }
}
}
@@ -381,12 +480,21 @@ foreach mx = SchedMxList in {
foreach mx = SchedMxListW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
- defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
+ // Slightly increased for integer LMULs
+ defvar Multiplier = !cond(
+ !eq(mx, "M2") : 2,
+ !eq(mx, "M4") : 2,
+ true: 1
+ );
+
+ let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
}
// 12. Vector Fixed-Point Arithmetic Instructions
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index b43b915..da6ac2f 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -104,11 +104,6 @@ static cl::opt<bool> EnablePostMISchedLoadStoreClustering(
cl::desc("Enable PostRA load and store clustering in the machine scheduler"),
cl::init(true));
-static cl::opt<bool>
- EnableVLOptimizer("riscv-enable-vl-optimizer",
- cl::desc("Enable the RISC-V VL Optimizer pass"),
- cl::init(true), cl::Hidden);
-
static cl::opt<bool> DisableVectorMaskMutation(
"riscv-disable-vector-mask-mutation",
cl::desc("Disable the vector mask scheduling mutation"), cl::init(false),
@@ -617,8 +612,7 @@ void RISCVPassConfig::addPreRegAlloc() {
addPass(createRISCVPreRAExpandPseudoPass());
if (TM->getOptLevel() != CodeGenOptLevel::None) {
addPass(createRISCVMergeBaseOffsetOptPass());
- if (EnableVLOptimizer)
- addPass(createRISCVVLOptimizerPass());
+ addPass(createRISCVVLOptimizerPass());
}
addPass(createRISCVInsertReadWriteCSRPass());
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index b53d919..c946451 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -114,14 +114,6 @@ FunctionPass *llvm::createRISCVVLOptimizerPass() {
return new RISCVVLOptimizer();
}
-/// Return true if R is a physical or virtual vector register, false otherwise.
-static bool isVectorRegClass(Register R, const MachineRegisterInfo *MRI) {
- if (R.isPhysical())
- return RISCV::VRRegClass.contains(R);
- const TargetRegisterClass *RC = MRI->getRegClass(R);
- return RISCVRI::isVRegClass(RC->TSFlags);
-}
-
LLVM_ATTRIBUTE_UNUSED
static raw_ostream &operator<<(raw_ostream &OS, const OperandInfo &OI) {
OI.print(OS);
@@ -183,37 +175,28 @@ static unsigned getIntegerExtensionOperandEEW(unsigned Factor,
return Log2EEW;
}
-/// Check whether MO is a mask operand of MI.
-static bool isMaskOperand(const MachineInstr &MI, const MachineOperand &MO,
- const MachineRegisterInfo *MRI) {
-
- if (!MO.isReg() || !isVectorRegClass(MO.getReg(), MRI))
- return false;
-
- const MCInstrDesc &Desc = MI.getDesc();
- return Desc.operands()[MO.getOperandNo()].RegClass == RISCV::VMV0RegClassID;
-}
-
static std::optional<unsigned>
getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
const MachineInstr &MI = *MO.getParent();
+ const MCInstrDesc &Desc = MI.getDesc();
const RISCVVPseudosTable::PseudoInfo *RVV =
RISCVVPseudosTable::getPseudoInfo(MI.getOpcode());
assert(RVV && "Could not find MI in PseudoTable");
// MI has a SEW associated with it. The RVV specification defines
// the EEW of each operand and definition in relation to MI.SEW.
- unsigned MILog2SEW =
- MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
+ unsigned MILog2SEW = MI.getOperand(RISCVII::getSEWOpNum(Desc)).getImm();
- const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(MI.getDesc());
- const bool IsTied = RISCVII::isTiedPseudo(MI.getDesc().TSFlags);
+ const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(Desc);
+ const bool IsTied = RISCVII::isTiedPseudo(Desc.TSFlags);
bool IsMODef = MO.getOperandNo() == 0 ||
(HasPassthru && MO.getOperandNo() == MI.getNumExplicitDefs());
// All mask operands have EEW=1
- if (isMaskOperand(MI, MO, MRI))
+ const MCOperandInfo &Info = Desc.operands()[MO.getOperandNo()];
+ if (Info.OperandType == MCOI::OPERAND_REGISTER &&
+ Info.RegClass == RISCV::VMV0RegClassID)
return 0;
// switch against BaseInstr to reduce number of cases that need to be
@@ -1296,8 +1279,8 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
TII->get(RISCV::getRVVMCOpcode(MI.getOpcode())).TSFlags) &&
"Instruction shouldn't be supported if elements depend on VL");
- assert(MI.getOperand(0).isReg() &&
- isVectorRegClass(MI.getOperand(0).getReg(), MRI) &&
+ assert(RISCVRI::isVRegClass(
+ MRI->getRegClass(MI.getOperand(0).getReg())->TSFlags) &&
"All supported instructions produce a vector register result");
LLVM_DEBUG(dbgs() << "Found a candidate for VL reduction: " << MI << "\n");
diff --git a/llvm/lib/Target/SPIRV/SPIRVAPI.cpp b/llvm/lib/Target/SPIRV/SPIRVAPI.cpp
index bbf1d87..cfe7ef4 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAPI.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAPI.cpp
@@ -116,8 +116,8 @@ SPIRVTranslate(Module *M, std::string &SpirvObj, std::string &ErrMsg,
PM.add(new TargetLibraryInfoWrapperPass(TLII));
std::unique_ptr<MachineModuleInfoWrapperPass> MMIWP(
new MachineModuleInfoWrapperPass(Target.get()));
- const_cast<TargetLoweringObjectFile *>(Target->getObjFileLowering())
- ->Initialize(MMIWP->getMMI().getContext(), *Target);
+ Target->getObjFileLowering()->Initialize(MMIWP->getMMI().getContext(),
+ *Target);
SmallString<4096> OutBuffer;
raw_svector_ostream OutStream(OutBuffer);
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index b90e1aa..3c631ce 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -665,10 +665,10 @@ Type *SPIRVEmitIntrinsics::deduceElementTypeHelper(
auto *HandleType = cast<TargetExtType>(II->getOperand(0)->getType());
if (HandleType->getTargetExtName() == "spirv.Image" ||
HandleType->getTargetExtName() == "spirv.SignedImage") {
- if (II->hasOneUse()) {
- auto *U = *II->users().begin();
+ for (User *U : II->users()) {
Ty = cast<Instruction>(U)->getAccessType();
- assert(Ty && "Unable to get type for resource pointer.");
+ if (Ty)
+ break;
}
} else if (HandleType->getTargetExtName() == "spirv.VulkanBuffer") {
// This call is supposed to index into an array
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
index 4a9c88b..a95c4ff 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/SparcFixupKinds.h"
-#include "MCTargetDesc/SparcMCAsmInfo.h"
#include "MCTargetDesc/SparcMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index 1ee6e80..79da53e 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -13,10 +13,7 @@
#include "MCTargetDesc/SparcMCAsmInfo.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCObjectStreamer.h"
-#include "llvm/MC/MCValue.h"
using namespace llvm;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
index 3ef6030..72bb372 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
@@ -69,8 +69,8 @@ void SystemZHLASMAsmStreamer::EmitEOL() {
void SystemZHLASMAsmStreamer::changeSection(MCSection *Section,
uint32_t Subsection) {
- Section->printSwitchToSection(*MAI, getContext().getTargetTriple(), OS,
- Subsection);
+ MAI->printSwitchToSection(*Section, Subsection,
+ getContext().getTargetTriple(), OS);
MCStreamer::changeSection(Section, Subsection);
}
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index 19c9e9c..6ae69a4 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -900,7 +900,8 @@ public:
bool checkDataSection() {
if (CurrentState != DataSection) {
- auto *WS = cast<MCSectionWasm>(getStreamer().getCurrentSectionOnly());
+ auto *WS = static_cast<const MCSectionWasm *>(
+ getStreamer().getCurrentSectionOnly());
if (WS && WS->isText())
return error("data directive must occur in a data segment: ",
Lexer.getTok());
@@ -1218,7 +1219,8 @@ public:
void doBeforeLabelEmit(MCSymbol *Symbol, SMLoc IDLoc) override {
// Code below only applies to labels in text sections.
- auto *CWS = cast<MCSectionWasm>(getStreamer().getCurrentSectionOnly());
+ auto *CWS = static_cast<const MCSectionWasm *>(
+ getStreamer().getCurrentSectionOnly());
if (!CWS->isText())
return;
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td
index 13603f8..a606209 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.td
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.td
@@ -71,6 +71,7 @@ def FeatureReferenceTypes :
SubtargetFeature<"reference-types", "HasReferenceTypes", "true",
"Enable reference types">;
+def FeatureGC : SubtargetFeature<"gc", "HasGC", "true", "Enable wasm gc">;
def FeatureRelaxedSIMD :
SubtargetFeature<"relaxed-simd", "SIMDLevel", "RelaxedSIMD",
"Enable relaxed-simd instructions">;
@@ -136,13 +137,13 @@ def : ProcessorModel<"lime1", NoSchedModel,
// Latest and greatest experimental version of WebAssembly. Bugs included!
def : ProcessorModel<"bleeding-edge", NoSchedModel,
- [FeatureAtomics, FeatureBulkMemory, FeatureBulkMemoryOpt,
- FeatureCallIndirectOverlong, FeatureExceptionHandling,
- FeatureExtendedConst, FeatureFP16, FeatureMultiMemory,
- FeatureMultivalue, FeatureMutableGlobals,
- FeatureNontrappingFPToInt, FeatureRelaxedSIMD,
- FeatureReferenceTypes, FeatureSIMD128, FeatureSignExt,
- FeatureTailCall]>;
+ [FeatureAtomics, FeatureBulkMemory, FeatureBulkMemoryOpt,
+ FeatureCallIndirectOverlong, FeatureExceptionHandling,
+ FeatureExtendedConst, FeatureFP16, FeatureMultiMemory,
+ FeatureMultivalue, FeatureMutableGlobals,
+ FeatureNontrappingFPToInt, FeatureRelaxedSIMD,
+ FeatureReferenceTypes, FeatureGC, FeatureSIMD128,
+ FeatureSignExt, FeatureTailCall]>;
//===----------------------------------------------------------------------===//
// Target Declaration
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index fa5776c..b03b350 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -123,7 +123,7 @@ static SDValue getTagSymNode(int Tag, SelectionDAG *DAG) {
static APInt encodeFunctionSignature(SelectionDAG *DAG, SDLoc &DL,
SmallVector<MVT, 4> &Returns,
SmallVector<MVT, 4> &Params) {
- auto toWasmValType = [&](MVT VT) {
+ auto toWasmValType = [](MVT VT) {
if (VT == MVT::i32) {
return wasm::ValType::I32;
}
@@ -244,10 +244,18 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
MCSymbol *Table = WebAssembly::getOrCreateFunctionTableSymbol(
MF.getContext(), Subtarget);
SDValue TableSym = CurDAG->getMCSymbol(Table, PtrVT);
- SDValue FuncRef = SDValue(
- CurDAG->getMachineNode(WebAssembly::TABLE_GET_FUNCREF, DL,
- MVT::funcref, TableSym, Node->getOperand(1)),
- 0);
+ SDValue FuncPtr = Node->getOperand(1);
+ if (Subtarget->hasAddr64() && FuncPtr.getValueType() == MVT::i64) {
+ // table.get expects an i32 but on 64 bit platforms the function pointer
+ // is an i64. In that case, i32.wrap_i64 to convert.
+ FuncPtr = SDValue(CurDAG->getMachineNode(WebAssembly::I32_WRAP_I64, DL,
+ MVT::i32, FuncPtr),
+ 0);
+ }
+ SDValue FuncRef =
+ SDValue(CurDAG->getMachineNode(WebAssembly::TABLE_GET_FUNCREF, DL,
+ MVT::funcref, TableSym, FuncPtr),
+ 0);
// Encode the signature information into the type index placeholder.
// This gets decoded and converted into the actual type signature in
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 11936a3..cd434f7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -288,7 +288,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Expand float operations supported for scalars but not SIMD
for (auto Op : {ISD::FCOPYSIGN, ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
- ISD::FEXP, ISD::FEXP2})
+ ISD::FEXP, ISD::FEXP2, ISD::FEXP10})
for (auto T : {MVT::v4f32, MVT::v2f64})
setOperationAction(Op, T, Expand);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index b5e723e..2b632fd 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -76,6 +76,9 @@ def HasReferenceTypes :
Predicate<"Subtarget->hasReferenceTypes()">,
AssemblerPredicate<(all_of FeatureReferenceTypes), "reference-types">;
+def HasGC : Predicate<"Subtarget->hasGC()">,
+ AssemblerPredicate<(all_of FeatureGC), "gc">;
+
def HasRelaxedSIMD :
Predicate<"Subtarget->hasRelaxedSIMD()">,
AssemblerPredicate<(all_of FeatureRelaxedSIMD), "relaxed-simd">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
index 40b87a0..fc82e5b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
@@ -36,13 +36,10 @@ multiclass REF_I<WebAssemblyRegClass rc, ValueType vt, string ht> {
Requires<[HasReferenceTypes]>;
}
-defm REF_TEST_FUNCREF :
- I<(outs I32: $res),
- (ins TypeIndex:$type, FUNCREF: $ref),
- (outs),
- (ins TypeIndex:$type),
- [],
- "ref.test\t$type, $ref", "ref.test $type", 0xfb14>;
+defm REF_TEST_FUNCREF : I<(outs I32:$res), (ins TypeIndex:$type, FUNCREF:$ref),
+ (outs), (ins TypeIndex:$type), [],
+ "ref.test\t$type, $ref", "ref.test $type", 0xfb14>,
+ Requires<[HasGC]>;
defm "" : REF_I<FUNCREF, funcref, "func">;
defm "" : REF_I<EXTERNREF, externref, "extern">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
index 7912aeb..ffd135d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -63,8 +63,10 @@ void OptimizeReturned::visitCallBase(CallBase &CB) {
if (isa<Constant>(Arg))
continue;
// Like replaceDominatedUsesWith but using Instruction/Use dominance.
- Arg->replaceUsesWithIf(&CB,
- [&](Use &U) { return DT->dominates(&CB, U); });
+ Arg->replaceUsesWithIf(&CB, [&](Use &U) {
+ auto *I = cast<Instruction>(U.getUser());
+ return !I->isLifetimeStartOrEnd() && DT->dominates(&CB, U);
+ });
}
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
index 40ea48a..a3ce40f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -43,6 +43,11 @@ WebAssemblySubtarget::initializeSubtargetDependencies(StringRef CPU,
Bits.set(WebAssembly::FeatureBulkMemoryOpt);
}
+ // gc implies reference-types
+ if (HasGC) {
+ HasReferenceTypes = true;
+ }
+
// reference-types implies call-indirect-overlong
if (HasReferenceTypes) {
HasCallIndirectOverlong = true;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
index 591ce256..f814274 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -51,6 +51,7 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
bool HasMutableGlobals = false;
bool HasNontrappingFPToInt = false;
bool HasReferenceTypes = false;
+ bool HasGC = false;
bool HasSignExt = false;
bool HasTailCall = false;
bool HasWideArithmetic = false;
@@ -107,6 +108,7 @@ public:
bool hasMutableGlobals() const { return HasMutableGlobals; }
bool hasNontrappingFPToInt() const { return HasNontrappingFPToInt; }
bool hasReferenceTypes() const { return HasReferenceTypes; }
+ bool hasGC() const { return HasGC; }
bool hasRelaxedSIMD() const { return SIMDLevel >= RelaxedSIMD; }
bool hasSignExt() const { return HasSignExt; }
bool hasSIMD128() const { return SIMDLevel >= SIMD128; }
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index b642c1c..d7671ed 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1042,8 +1042,8 @@ private:
}
PrevState = CurrState;
}
- void onRParen() {
- PrevState = State;
+ bool onRParen(StringRef &ErrMsg) {
+ IntelExprState CurrState = State;
switch (State) {
default:
State = IES_ERROR;
@@ -1054,9 +1054,27 @@ private:
case IES_RBRAC:
case IES_RPAREN:
State = IES_RPAREN;
+ // In the case of a multiply, onRegister has already set IndexReg
+ // directly, with appropriate scale.
+ // Otherwise if we just saw a register it has only been stored in
+ // TmpReg, so we need to store it into the state machine.
+ if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+ // If we already have a BaseReg, then assume this is the IndexReg with
+ // no explicit scale.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ if (IndexReg)
+ return regsUseUpError(ErrMsg);
+ IndexReg = TmpReg;
+ Scale = 0;
+ }
+ }
IC.pushOperator(IC_RPAREN);
break;
}
+ PrevState = CurrState;
+ return false;
}
bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID,
const InlineAsmIdentifierInfo &IDInfo,
@@ -2172,7 +2190,11 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
}
break;
case AsmToken::LParen: SM.onLParen(); break;
- case AsmToken::RParen: SM.onRParen(); break;
+ case AsmToken::RParen:
+ if (SM.onRParen(ErrMsg)) {
+ return Error(Tok.getLoc(), ErrMsg);
+ }
+ break;
}
if (SM.hadError())
return Error(Tok.getLoc(), "unknown token in expression");
@@ -4781,7 +4803,7 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) {
getStreamer().initSections(false, getSTI());
Section = getStreamer().getCurrentSectionOnly();
}
- if (Section->useCodeAlign())
+ if (getContext().getAsmInfo()->useCodeAlign(*Section))
getStreamer().emitCodeAlignment(Align(2), &getSTI(), 0);
else
getStreamer().emitValueToAlignment(Align(2), 0, 1, 0);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index e213923..7f9d474 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -388,36 +388,6 @@ static bool mayHaveInterruptDelaySlot(unsigned InstOpcode) {
return false;
}
-/// Check if the instruction to be emitted is right after any data.
-static bool
-isRightAfterData(MCFragment *CurrentFragment,
- const std::pair<MCFragment *, size_t> &PrevInstPosition) {
- MCFragment *F = CurrentFragment;
- // Since data is always emitted into a DataFragment, our check strategy is
- // simple here.
- // - If the fragment is a DataFragment
- // - If it's empty (section start or data after align), return false.
- // - If it's not the fragment where the previous instruction is,
- // returns true.
- // - If it's the fragment holding the previous instruction but its
- // size changed since the previous instruction was emitted into
- // it, returns true.
- // - Otherwise returns false.
- // - If the fragment is not a DataFragment, returns false.
- if (F->getKind() == MCFragment::FT_Data)
- return F->getFixedSize() && (F != PrevInstPosition.first ||
- F->getFixedSize() != PrevInstPosition.second);
-
- return false;
-}
-
-/// \returns the fragment size if it has instructions, otherwise returns 0.
-static size_t getSizeForInstFragment(const MCFragment *F) {
- if (!F || !F->hasInstructions())
- return 0;
- return F->getSize();
-}
-
/// Return true if we can insert NOP or prefixes automatically before the
/// the instruction to be emitted.
bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const {
@@ -441,9 +411,11 @@ bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const {
// semantic.
return false;
- if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition))
- // If this instruction follows any data, there is no clear
- // instruction boundary, inserting a nop/prefix would change semantic.
+ // If this instruction follows any data, there is no clear instruction
+ // boundary, inserting a nop/prefix would change semantic.
+ auto Offset = OS.getCurFragSize();
+ if (Offset && (OS.getCurrentFragment() != PrevInstPosition.first ||
+ Offset != PrevInstPosition.second))
return false;
return true;
@@ -552,7 +524,7 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
// Update PrevInstOpcode here, canPadInst() reads that.
MCFragment *CF = OS.getCurrentFragment();
PrevInstOpcode = Inst.getOpcode();
- PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF));
+ PrevInstPosition = std::make_pair(CF, OS.getCurFragSize());
if (!canPadBranches(OS))
return;
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index f5eeb3b..d691538 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "X86MCAsmInfo.h"
-#include "MCTargetDesc/X86MCExpr.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/CommandLine.h"
diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def
index 620526ff..3f2a433 100644
--- a/llvm/lib/Target/X86/X86PassRegistry.def
+++ b/llvm/lib/Target/X86/X86PassRegistry.def
@@ -12,8 +12,52 @@
// NOTE: NO INCLUDE GUARD DESIRED!
+#ifndef DUMMY_FUNCTION_PASS
+#define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS)
+#endif
+DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this))
+DUMMY_FUNCTION_PASS("lower-amx-type", X86LowerAMXTypePass(*this))
+DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction())
+DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass())
+#undef DUMMY_FUNCTION_PASS
+
#ifndef MACHINE_FUNCTION_PASS
#define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
#endif
MACHINE_FUNCTION_PASS("x86-isel", X86ISelDAGToDAGPass(*this))
#undef MACHINE_FUNCTION_PASS
+
+#ifndef DUMMY_MACHINE_FUNCTION_PASS
+#define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME)
+#endif
+DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-SFB", X86AvoidSFBPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-trailing-call", X86AvoidTrailingCallPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-cf-opt", X86CallFrameOptimization())
+DUMMY_MACHINE_FUNCTION_PASS("x86-cmov-conversion", X86CmovConverterPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-codege", FPS())
+DUMMY_MACHINE_FUNCTION_PASS("x86-compress-evex", CompressEVEXPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-domain-reassignment", X86DomainReassignment())
+DUMMY_MACHINE_FUNCTION_PASS("x86-dyn-alloca-expander", X86DynAllocaExpander())
+DUMMY_MACHINE_FUNCTION_PASS("x86-execution-domain-fix", X86ExecutionDomainFix())
+DUMMY_MACHINE_FUNCTION_PASS("fastpretileconfig", X86FastPreTileConfig())
+DUMMY_MACHINE_FUNCTION_PASS("fasttileconfig", X86FastTileConfig())
+DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-LEAs", FixupLEAPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-bw-inst", FixupBWInstPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-inst-tuning", X86FixupInstTuningPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-setcc", X86FixupSetCCPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-vector-constants", X86FixupVectorConstantsPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-flags-copy-lowering", X86FlagsCopyLoweringPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-lower-tile-copy", X86LowerTileCopy())
+DUMMY_MACHINE_FUNCTION_PASS("x86-lvi-load", X86LoadValueInjectionLoadHardeningPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-lvi-ret", X86LoadValueInjectionRetHardeningPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-optimize-LEAs", X86OptimizeLEAPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-pseudo", X86ExpandPseudo())
+DUMMY_MACHINE_FUNCTION_PASS("x86-return-thunks", X86ReturnThunks())
+DUMMY_MACHINE_FUNCTION_PASS("x86-seses", X86SpeculativeExecutionSideEffectSuppression())
+DUMMY_MACHINE_FUNCTION_PASS("x86-slh", X86SpeculativeLoadHardeningPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-suppress-apx-for-relocation", X86SuppressAPXForRelocationPass())
+DUMMY_MACHINE_FUNCTION_PASS("tile-pre-config", X86PreTileConfig())
+DUMMY_MACHINE_FUNCTION_PASS("tileconfig", X86TileConfig())
+DUMMY_MACHINE_FUNCTION_PASS("x86-wineh-unwindv2", X86WinEHUnwindV2())
+DUMMY_MACHINE_FUNCTION_PASS("x86argumentstackrebase", X86ArgumentStackSlotPass())
+#undef DUMMY_MACHINE_FUNCTION_PASS
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 37a7b37..90791fc 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1838,14 +1838,15 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry AVX512BWShuffleTbl[] = {
- { TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
+ { TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
- { TTI::SK_Reverse, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
- { TTI::SK_Reverse, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
+ { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw
+ { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw
{ TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
- { TTI::SK_Reverse, MVT::v64i8, { 2, 2, 2, 2 } }, // pshufb + vshufi64x2
+ { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
+ { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2
{ TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
{ TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
@@ -1874,18 +1875,25 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry AVX512ShuffleTbl[] = {
- {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
- {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
- {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
- {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
- {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
- {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
- {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
-
- {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
- {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
- {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
- {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
+ {TTI::SK_Broadcast, MVT::v8f64, { 1, 3, 1, 1 } }, // vbroadcastsd
+ {TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 1 } }, // vbroadcastsd
+ {TTI::SK_Broadcast, MVT::v16f32, { 1, 3, 1, 1 } }, // vbroadcastss
+ {TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 1 } }, // vbroadcastss
+ {TTI::SK_Broadcast, MVT::v8i64, { 1, 3, 1, 1 } }, // vpbroadcastq
+ {TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 1 } }, // vpbroadcastq
+ {TTI::SK_Broadcast, MVT::v16i32, { 1, 3, 1, 1 } }, // vpbroadcastd
+ {TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 1 } }, // vpbroadcastd
+ {TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
+ {TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 1 }}, // vpbroadcastb
+
+ {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd
+ {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps
+ {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq
+ {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd
{TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
{TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
{TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
@@ -1973,21 +1981,24 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry AVX2ShuffleTbl[] = {
- { TTI::SK_Broadcast, MVT::v4f64, { 1, 1, 1, 1 } }, // vbroadcastpd
- { TTI::SK_Broadcast, MVT::v8f32, { 1, 1, 1, 1 } }, // vbroadcastps
- { TTI::SK_Broadcast, MVT::v4i64, { 1, 1, 1, 1 } }, // vpbroadcastq
- { TTI::SK_Broadcast, MVT::v8i32, { 1, 1, 1, 1 } }, // vpbroadcastd
- { TTI::SK_Broadcast, MVT::v16i16, { 1, 1, 1, 1 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v16f16, { 1, 1, 1, 1 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v32i8, { 1, 1, 1, 1 } }, // vpbroadcastb
-
- { TTI::SK_Reverse, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
- { TTI::SK_Reverse, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
- { TTI::SK_Reverse, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
- { TTI::SK_Reverse, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
- { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
- { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
- { TTI::SK_Reverse, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
+ { TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 2 } }, // vbroadcastpd
+ { TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 2 } }, // vbroadcastps
+ { TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 2 } }, // vpbroadcastq
+ { TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 2 } }, // vpbroadcastd
+ { TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 2 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v8i16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 2 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v8f16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 2 } }, // vpbroadcastb
+ { TTI::SK_Broadcast, MVT::v16i8, { 1, 3, 1, 1 } }, // vpbroadcastb
+
+ { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd
+ { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps
+ { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq
+ { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd
+ { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
+ { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
+ { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
{ TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
{ TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
@@ -2077,23 +2088,23 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry AVX1ShuffleTbl[] = {
- {TTI::SK_Broadcast, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vpermilpd
- {TTI::SK_Broadcast, MVT::v8f32, {2,2,2,2}}, // vperm2f128 + vpermilps
- {TTI::SK_Broadcast, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vpermilpd
- {TTI::SK_Broadcast, MVT::v8i32, {2,2,2,2}}, // vperm2f128 + vpermilps
- {TTI::SK_Broadcast, MVT::v16i16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128
- {TTI::SK_Broadcast, MVT::v16f16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128
- {TTI::SK_Broadcast, MVT::v32i8, {2,2,2,2}}, // vpshufb + vinsertf128
-
- {TTI::SK_Reverse, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vpermilpd
- {TTI::SK_Reverse, MVT::v8f32, {2,2,2,2}}, // vperm2f128 + vpermilps
- {TTI::SK_Reverse, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vpermilpd
- {TTI::SK_Reverse, MVT::v8i32, {2,2,2,2}}, // vperm2f128 + vpermilps
- {TTI::SK_Reverse, MVT::v16i16, {4,4,4,4}}, // vextractf128 + 2*pshufb
+ {TTI::SK_Broadcast, MVT::v4f64, {2,3,2,3}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Broadcast, MVT::v8f32, {2,3,2,3}}, // vperm2f128 + vpermilps
+ {TTI::SK_Broadcast, MVT::v4i64, {2,3,2,3}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Broadcast, MVT::v8i32, {2,3,2,3}}, // vperm2f128 + vpermilps
+ {TTI::SK_Broadcast, MVT::v16i16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
+ {TTI::SK_Broadcast, MVT::v16f16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
+ {TTI::SK_Broadcast, MVT::v32i8, {3,4,3,6}}, // vpshufb + vinsertf128
+
+ {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps
+ {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps
+ {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb
// + vinsertf128
- {TTI::SK_Reverse, MVT::v16f16, {4,4,4,4}}, // vextractf128 + 2*pshufb
+ {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb
// + vinsertf128
- {TTI::SK_Reverse, MVT::v32i8, {4,4,4,4}}, // vextractf128 + 2*pshufb
+ {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb
// + vinsertf128
{TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd
@@ -2156,13 +2167,13 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry SSSE3ShuffleTbl[] = {
- {TTI::SK_Broadcast, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
- {TTI::SK_Broadcast, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
- {TTI::SK_Broadcast, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
+ {TTI::SK_Broadcast, MVT::v8i16, {1, 3, 2, 2}}, // pshufb
+ {TTI::SK_Broadcast, MVT::v8f16, {1, 3, 2, 2}}, // pshufb
+ {TTI::SK_Broadcast, MVT::v16i8, {1, 3, 2, 2}}, // pshufb
- {TTI::SK_Reverse, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
- {TTI::SK_Reverse, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
- {TTI::SK_Reverse, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
+ {TTI::SK_Reverse, MVT::v8i16, {1, 2, 1, 2}}, // pshufb
+ {TTI::SK_Reverse, MVT::v8f16, {1, 2, 1, 2}}, // pshufb
+ {TTI::SK_Reverse, MVT::v16i8, {1, 2, 1, 2}}, // pshufb
{TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
{TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
@@ -2192,16 +2203,16 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
{TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
{TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
- {TTI::SK_Broadcast, MVT::v8i16, {2, 2, 2, 2}}, // pshuflw + pshufd
- {TTI::SK_Broadcast, MVT::v8f16, {2, 2, 2, 2}}, // pshuflw + pshufd
- {TTI::SK_Broadcast, MVT::v16i8, {3, 3, 3, 3}}, // unpck + pshuflw + pshufd
+ {TTI::SK_Broadcast, MVT::v8i16, {1, 2, 2, 2}}, // pshuflw + pshufd
+ {TTI::SK_Broadcast, MVT::v8f16, {1, 2, 2, 2}}, // pshuflw + pshufd
+ {TTI::SK_Broadcast, MVT::v16i8, {2, 3, 3, 4}}, // unpck + pshuflw + pshufd
{TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
{TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
{TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
- {TTI::SK_Reverse, MVT::v8i16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
- {TTI::SK_Reverse, MVT::v8f16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
- {TTI::SK_Reverse, MVT::v16i8, {9, 9, 9, 9}}, // 2*pshuflw + 2*pshufhw
+ {TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
+ {TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
+ {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + packus
{TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 4ca7444..e5c896f 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -451,6 +451,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["permlane16-swap"] = true;
Features["ashr-pk-insts"] = true;
Features["atomic-buffer-pk-add-bf16-inst"] = true;
+ Features["vmem-pref-insts"] = true;
Features["atomic-fadd-rtn-insts"] = true;
Features["atomic-buffer-global-pk-add-f16-insts"] = true;
Features["atomic-flat-pk-add-16-insts"] = true;
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index be51453..ee6651c 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -8,7 +8,6 @@
#include "llvm/TargetParser/Triple.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/CodeGen.h"
diff --git a/llvm/lib/TextAPI/SymbolSet.cpp b/llvm/lib/TextAPI/SymbolSet.cpp
index 2e0b416..f21a061 100644
--- a/llvm/lib/TextAPI/SymbolSet.cpp
+++ b/llvm/lib/TextAPI/SymbolSet.cpp
@@ -11,6 +11,11 @@
using namespace llvm;
using namespace llvm::MachO;
+SymbolSet::~SymbolSet() {
+ for (auto &[Key, Sym] : Symbols)
+ Sym->~Symbol();
+}
+
Symbol *SymbolSet::addGlobalImpl(EncodeKind Kind, StringRef Name,
SymbolFlags Flags) {
Name = copyString(Name);
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index 59ae057..ac93f748 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -85,6 +85,9 @@ static Intrinsic::ID NonOverloadedCoroIntrinsics[] = {
Intrinsic::coro_id_async,
Intrinsic::coro_id_retcon,
Intrinsic::coro_id_retcon_once,
+ Intrinsic::coro_noop,
+ Intrinsic::coro_prepare_async,
+ Intrinsic::coro_prepare_retcon,
Intrinsic::coro_promise,
Intrinsic::coro_resume,
Intrinsic::coro_save,
diff --git a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
index 5a87cf8..b3910c4 100644
--- a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
+++ b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
@@ -48,6 +48,7 @@
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -63,7 +64,7 @@ static inline void eraseFromModule(T &ToErase) {
ToErase.eraseFromParent();
}
-static inline bool checkIfSupported(GlobalVariable &G) {
+static bool checkIfSupported(GlobalVariable &G) {
if (!G.isThreadLocal())
return true;
@@ -114,24 +115,221 @@ static inline void clearModule(Module &M) { // TODO: simplify.
eraseFromModule(*M.ifuncs().begin());
}
+static SmallVector<std::reference_wrapper<Use>>
+collectIndirectableUses(GlobalVariable *G) {
+ // We are interested only in use chains that end in an Instruction.
+ SmallVector<std::reference_wrapper<Use>> Uses;
+
+ SmallVector<std::reference_wrapper<Use>> Stack(G->use_begin(), G->use_end());
+ while (!Stack.empty()) {
+ Use &U = Stack.pop_back_val();
+ if (isa<Instruction>(U.getUser()))
+ Uses.emplace_back(U);
+ else
+ transform(U.getUser()->uses(), std::back_inserter(Stack),
+ [](auto &&U) { return std::ref(U); });
+ }
+
+ return Uses;
+}
+
+static inline GlobalVariable *getGlobalForName(GlobalVariable *G) {
+ // Create an anonymous global which stores the variable's name, which will be
+ // used by the HIPSTDPAR runtime to look up the program-wide symbol.
+ LLVMContext &Ctx = G->getContext();
+ auto *CDS = ConstantDataArray::getString(Ctx, G->getName());
+
+ GlobalVariable *N = G->getParent()->getOrInsertGlobal("", CDS->getType());
+ N->setInitializer(CDS);
+ N->setLinkage(GlobalValue::LinkageTypes::PrivateLinkage);
+ N->setConstant(true);
+
+ return N;
+}
+
+static inline GlobalVariable *getIndirectionGlobal(Module *M) {
+ // Create an anonymous global which stores a pointer to a pointer, which will
+ // be externally initialised by the HIPSTDPAR runtime with the address of the
+ // program-wide symbol.
+ Type *PtrTy = PointerType::get(
+ M->getContext(), M->getDataLayout().getDefaultGlobalsAddressSpace());
+ GlobalVariable *NewG = M->getOrInsertGlobal("", PtrTy);
+
+ NewG->setInitializer(PoisonValue::get(NewG->getValueType()));
+ NewG->setLinkage(GlobalValue::LinkageTypes::PrivateLinkage);
+ NewG->setConstant(true);
+ NewG->setExternallyInitialized(true);
+
+ return NewG;
+}
+
+static Constant *
+appendIndirectedGlobal(const GlobalVariable *IndirectionTable,
+ SmallVector<Constant *> &SymbolIndirections,
+ GlobalVariable *ToIndirect) {
+ Module *M = ToIndirect->getParent();
+
+ auto *InitTy = cast<StructType>(IndirectionTable->getValueType());
+ auto *SymbolListTy = cast<StructType>(InitTy->getStructElementType(2));
+ Type *NameTy = SymbolListTy->getElementType(0);
+ Type *IndirectTy = SymbolListTy->getElementType(1);
+
+ Constant *NameG = getGlobalForName(ToIndirect);
+ Constant *IndirectG = getIndirectionGlobal(M);
+ Constant *Entry = ConstantStruct::get(
+ SymbolListTy, {ConstantExpr::getAddrSpaceCast(NameG, NameTy),
+ ConstantExpr::getAddrSpaceCast(IndirectG, IndirectTy)});
+ SymbolIndirections.push_back(Entry);
+
+ return IndirectG;
+}
+
+static void fillIndirectionTable(GlobalVariable *IndirectionTable,
+ SmallVector<Constant *> Indirections) {
+ Module *M = IndirectionTable->getParent();
+ size_t SymCnt = Indirections.size();
+
+ auto *InitTy = cast<StructType>(IndirectionTable->getValueType());
+ Type *SymbolListTy = InitTy->getStructElementType(1);
+ auto *SymbolTy = cast<StructType>(InitTy->getStructElementType(2));
+
+ Constant *Count = ConstantInt::get(InitTy->getStructElementType(0), SymCnt);
+ M->removeGlobalVariable(IndirectionTable);
+ GlobalVariable *Symbols =
+ M->getOrInsertGlobal("", ArrayType::get(SymbolTy, SymCnt));
+ Symbols->setLinkage(GlobalValue::LinkageTypes::PrivateLinkage);
+ Symbols->setInitializer(
+ ConstantArray::get(ArrayType::get(SymbolTy, SymCnt), {Indirections}));
+ Symbols->setConstant(true);
+
+ Constant *ASCSymbols = ConstantExpr::getAddrSpaceCast(Symbols, SymbolListTy);
+ Constant *Init = ConstantStruct::get(
+ InitTy, {Count, ASCSymbols, PoisonValue::get(SymbolTy)});
+ M->insertGlobalVariable(IndirectionTable);
+ IndirectionTable->setInitializer(Init);
+}
+
+static void replaceWithIndirectUse(const Use &U, const GlobalVariable *G,
+ Constant *IndirectedG) {
+ auto *I = cast<Instruction>(U.getUser());
+
+ IRBuilder<> Builder(I);
+ unsigned OpIdx = U.getOperandNo();
+ Value *Op = I->getOperand(OpIdx);
+
+ // We walk back up the use chain, which could be an arbitrarily long sequence
+ // of constexpr AS casts, ptr-to-int and GEP instructions, until we reach the
+ // indirected global.
+ while (auto *CE = dyn_cast<ConstantExpr>(Op)) {
+ assert((CE->getOpcode() == Instruction::GetElementPtr ||
+ CE->getOpcode() == Instruction::AddrSpaceCast ||
+ CE->getOpcode() == Instruction::PtrToInt) &&
+ "Only GEP, ASCAST or PTRTOINT constant uses supported!");
+
+ Instruction *NewI = Builder.Insert(CE->getAsInstruction());
+ I->replaceUsesOfWith(Op, NewI);
+ I = NewI;
+ Op = I->getOperand(0);
+ OpIdx = 0;
+ Builder.SetInsertPoint(I);
+ }
+
+ assert(Op == G && "Must reach indirected global!");
+
+ I->setOperand(OpIdx, Builder.CreateLoad(G->getType(), IndirectedG));
+}
+
+static inline bool isValidIndirectionTable(GlobalVariable *IndirectionTable) {
+ std::string W;
+ raw_string_ostream OS(W);
+
+ Type *Ty = IndirectionTable->getValueType();
+ bool Valid = false;
+
+ if (!isa<StructType>(Ty)) {
+ OS << "The Indirection Table must be a struct type; ";
+ Ty->print(OS);
+ OS << " is incorrect.\n";
+ } else if (cast<StructType>(Ty)->getNumElements() != 3u) {
+ OS << "The Indirection Table must have 3 elements; "
+ << cast<StructType>(Ty)->getNumElements() << " is incorrect.\n";
+ } else if (!isa<IntegerType>(cast<StructType>(Ty)->getStructElementType(0))) {
+ OS << "The first element in the Indirection Table must be an integer; ";
+ cast<StructType>(Ty)->getStructElementType(0)->print(OS);
+ OS << " is incorrect.\n";
+ } else if (!isa<PointerType>(cast<StructType>(Ty)->getStructElementType(1))) {
+ OS << "The second element in the Indirection Table must be a pointer; ";
+ cast<StructType>(Ty)->getStructElementType(1)->print(OS);
+ OS << " is incorrect.\n";
+ } else if (!isa<StructType>(cast<StructType>(Ty)->getStructElementType(2))) {
+ OS << "The third element in the Indirection Table must be a struct type; ";
+ cast<StructType>(Ty)->getStructElementType(2)->print(OS);
+ OS << " is incorrect.\n";
+ } else {
+ Valid = true;
+ }
+
+ if (!Valid)
+ IndirectionTable->getContext().diagnose(DiagnosticInfoGeneric(W, DS_Error));
+
+ return Valid;
+}
+
+static void indirectGlobals(GlobalVariable *IndirectionTable,
+ SmallVector<GlobalVariable *> ToIndirect) {
+ // We replace globals with an indirected access via a pointer that will get
+ // set by the HIPSTDPAR runtime, using their accessible, program-wide unique
+ // address as set by the host linker-loader.
+ SmallVector<Constant *> SymbolIndirections;
+ for (auto &&G : ToIndirect) {
+ SmallVector<std::reference_wrapper<Use>> Uses = collectIndirectableUses(G);
+
+ if (Uses.empty())
+ continue;
+
+ Constant *IndirectedGlobal =
+ appendIndirectedGlobal(IndirectionTable, SymbolIndirections, G);
+
+ for_each(Uses,
+ [=](auto &&U) { replaceWithIndirectUse(U, G, IndirectedGlobal); });
+
+ eraseFromModule(*G);
+ }
+
+ if (SymbolIndirections.empty())
+ return;
+
+ fillIndirectionTable(IndirectionTable, std::move(SymbolIndirections));
+}
+
static inline void maybeHandleGlobals(Module &M) {
unsigned GlobAS = M.getDataLayout().getDefaultGlobalsAddressSpace();
- for (auto &&G : M.globals()) { // TODO: should we handle these in the FE?
+
+ SmallVector<GlobalVariable *> ToIndirect;
+ for (auto &&G : M.globals()) {
if (!checkIfSupported(G))
return clearModule(M);
-
- if (G.isThreadLocal())
- continue;
- if (G.isConstant())
- continue;
if (G.getAddressSpace() != GlobAS)
continue;
- if (G.getLinkage() != GlobalVariable::ExternalLinkage)
+ if (G.isConstant() && G.hasInitializer() && G.hasAtLeastLocalUnnamedAddr())
continue;
- G.setLinkage(GlobalVariable::ExternalWeakLinkage);
- G.setInitializer(nullptr);
- G.setExternallyInitialized(true);
+ ToIndirect.push_back(&G);
+ }
+
+ if (ToIndirect.empty())
+ return;
+
+ if (auto *IT = M.getNamedGlobal("__hipstdpar_symbol_indirection_table")) {
+ if (!isValidIndirectionTable(IT))
+ return clearModule(M);
+ return indirectGlobals(IT, std::move(ToIndirect));
+ } else {
+ for (auto &&G : ToIndirect) {
+ // We will internalise these, so we provide a poison initialiser.
+ if (!G->hasInitializer())
+ G->setInitializer(PoisonValue::get(G->getValueType()));
+ }
}
}
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index b803c97..0164fcd 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -2073,14 +2073,14 @@ std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
unsigned CloneNo) const {
auto VI = FSToVIMap.find(Func);
assert(VI != FSToVIMap.end());
+ std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo);
if (isa<AllocInfo *>(Call))
- return (VI->second.name() + " -> alloc").str();
+ return CallerName + " -> alloc";
else {
auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
- return (VI->second.name() + " -> " +
- getMemProfFuncName(Callsite->Callee.name(),
- Callsite->Clones[CloneNo]))
- .str();
+ return CallerName + " -> " +
+ getMemProfFuncName(Callsite->Callee.name(),
+ Callsite->Clones[CloneNo]);
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index d88bc2c..1b78ace 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1830,10 +1830,12 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
bool IntMinIsPoison = cast<Constant>(II->getArgOperand(1))->isOneValue();
// abs(-x) -> abs(x)
- // TODO: Copy nsw if it was present on the neg?
Value *X;
- if (match(IIOperand, m_Neg(m_Value(X))))
+ if (match(IIOperand, m_Neg(m_Value(X)))) {
+ if (cast<Instruction>(IIOperand)->hasNoSignedWrap() || IntMinIsPoison)
+ replaceOperand(*II, 1, Builder.getTrue());
return replaceOperand(*II, 0, X);
+ }
if (match(IIOperand, m_c_Select(m_Neg(m_Value(X)), m_Deferred(X))))
return replaceOperand(*II, 0, X);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index c5e1b04..da9b126 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -755,8 +755,7 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
// If the base pointers are different, but the indices are the same, just
// compare the base pointer.
- Value *PtrBase = GEPLHS->getOperand(0);
- if (PtrBase != GEPRHS->getOperand(0)) {
+ if (GEPLHS->getOperand(0) != GEPRHS->getOperand(0)) {
bool IndicesTheSame =
GEPLHS->getNumOperands() == GEPRHS->getNumOperands() &&
GEPLHS->getPointerOperand()->getType() ==
@@ -782,7 +781,7 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
if (GEPLHS->isInBounds() && GEPRHS->isInBounds() &&
(GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) &&
(GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) &&
- PtrBase->stripPointerCasts() ==
+ GEPLHS->getOperand(0)->stripPointerCasts() ==
GEPRHS->getOperand(0)->stripPointerCasts() &&
!GEPLHS->getType()->isVectorTy()) {
Value *LOffset = EmitGEPOffset(GEPLHS);
@@ -805,14 +804,10 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
LOffset, ROffset);
return replaceInstUsesWith(I, Cmp);
}
-
- // Otherwise, the base pointers are different and the indices are
- // different. Try convert this to an indexed compare by looking through
- // PHIs/casts.
- return transformToIndexedCompare(GEPLHS, RHS, Cond, DL, *this);
}
- if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands() &&
+ if (GEPLHS->getOperand(0) == GEPRHS->getOperand(0) &&
+ GEPLHS->getNumOperands() == GEPRHS->getNumOperands() &&
GEPLHS->getSourceElementType() == GEPRHS->getSourceElementType()) {
// If the GEPs only differ by one index, compare it.
unsigned NumDifferences = 0; // Keep track of # differences.
@@ -849,11 +844,14 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
}
}
- if (CanFold(NW)) {
+ if (Base.Ptr && CanFold(Base.LHSNW & Base.RHSNW) && !Base.isExpensive()) {
// ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2)
- Value *L = EmitGEPOffset(GEPLHS, /*RewriteGEP=*/true);
- Value *R = EmitGEPOffset(GEPRHS, /*RewriteGEP=*/true);
- return NewICmp(NW, L, R);
+ Type *IdxTy = DL.getIndexType(GEPLHS->getType());
+ Value *L =
+ EmitGEPOffsets(Base.LHSGEPs, Base.LHSNW, IdxTy, /*RewriteGEP=*/true);
+ Value *R =
+ EmitGEPOffsets(Base.RHSGEPs, Base.RHSNW, IdxTy, /*RewriteGEP=*/true);
+ return NewICmp(Base.LHSNW & Base.RHSNW, L, R);
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 2cc1bc9..0be1034 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -12,7 +12,6 @@
#include "InstCombineInternal.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -1503,8 +1502,7 @@ Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) {
// This is a non-terminator unreachable marker. Don't remove it.
if (isa<UndefValue>(Ptr)) {
// Remove guaranteed-to-transfer instructions before the marker.
- if (removeInstructionsBeforeUnreachable(SI))
- return &SI;
+ removeInstructionsBeforeUnreachable(SI);
// Remove all instructions after the marker and handle dead blocks this
// implies.
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 5849c3e..4e5a8d1 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -363,10 +363,10 @@ private:
void tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size);
Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag);
Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong);
- bool instrumentStack(memtag::StackInfo &Info, Value *StackTag, Value *UARTag,
+ void instrumentStack(memtag::StackInfo &Info, Value *StackTag, Value *UARTag,
const DominatorTree &DT, const PostDominatorTree &PDT,
const LoopInfo &LI);
- bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec);
+ void instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec);
Value *getNextTagWithCall(IRBuilder<> &IRB);
Value *getStackBaseTag(IRBuilder<> &IRB);
Value *getAllocaTag(IRBuilder<> &IRB, Value *StackTag, unsigned AllocaNo);
@@ -1418,7 +1418,7 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) {
}
}
-bool HWAddressSanitizer::instrumentLandingPads(
+void HWAddressSanitizer::instrumentLandingPads(
SmallVectorImpl<Instruction *> &LandingPadVec) {
for (auto *LP : LandingPadVec) {
IRBuilder<> IRB(LP->getNextNode());
@@ -1427,10 +1427,9 @@ bool HWAddressSanitizer::instrumentLandingPads(
{memtag::readRegister(
IRB, (TargetTriple.getArch() == Triple::x86_64) ? "rsp" : "sp")});
}
- return true;
}
-bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
+void HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
Value *StackTag, Value *UARTag,
const DominatorTree &DT,
const PostDominatorTree &PDT,
@@ -1460,8 +1459,6 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
size_t Size = memtag::getAllocaSizeInBytes(*AI);
size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
- Value *AICast = IRB.CreatePointerCast(AI, PtrTy);
-
auto HandleLifetime = [&](IntrinsicInst *II) {
// Set the lifetime intrinsic to cover the whole alloca. This reduces the
// set of assumptions we need to make about the lifetime. Without this we
@@ -1474,14 +1471,13 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
// one set of start / end in any execution (i.e. the ends are not
// reachable from each other), so this will not cause any problems.
II->setArgOperand(0, ConstantInt::get(Int64Ty, AlignedSize));
- II->setArgOperand(1, AICast);
};
llvm::for_each(Info.LifetimeStart, HandleLifetime);
llvm::for_each(Info.LifetimeEnd, HandleLifetime);
- AI->replaceUsesWithIf(Replacement, [AICast, AILong](const Use &U) {
+ AI->replaceUsesWithIf(Replacement, [AILong](const Use &U) {
auto *User = U.getUser();
- return User != AILong && User != AICast && !isa<LifetimeIntrinsic>(User);
+ return User != AILong && !isa<LifetimeIntrinsic>(User);
});
memtag::annotateDebugRecords(Info, retagMask(N));
@@ -1524,7 +1520,6 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
}
memtag::alignAndPadAlloca(Info, Mapping.getObjectAlignment());
}
- return true;
}
static void emitRemark(const Function &F, OptimizationRemarkEmitter &ORE,
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 854db0f..f451c2b 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -80,6 +80,27 @@ static cl::opt<unsigned>
ICPCSSkip("icp-csskip", cl::init(0), cl::Hidden,
cl::desc("Skip Callsite up to this number for this compilation"));
+// ICP the candidate function even when only a declaration is present.
+static cl::opt<bool> ICPAllowDecls(
+ "icp-allow-decls", cl::init(false), cl::Hidden,
+ cl::desc("Promote the target candidate even when the defintion "
+ " is not available"));
+
+// ICP hot candidate functions only. When setting to false, non-cold functions
+// (warm functions) can also be promoted.
+static cl::opt<bool>
+ ICPAllowHotOnly("icp-allow-hot-only", cl::init(true), cl::Hidden,
+ cl::desc("Promote the target candidate only if it is a "
+ "hot function. Otherwise, warm functions can "
+ "also be promoted"));
+
+// If one target cannot be ICP'd, proceed with the remaining targets instead
+// of exiting the callsite.
+static cl::opt<bool> ICPAllowCandidateSkip(
+ "icp-allow-candidate-skip", cl::init(false), cl::Hidden,
+ cl::desc("Continue with the remaining targets instead of exiting "
+ "when failing in a candidate"));
+
// Set if the pass is called in LTO optimization. The difference for LTO mode
// is the pass won't prefix the source module name to the internal linkage
// symbols.
@@ -330,6 +351,7 @@ private:
struct PromotionCandidate {
Function *const TargetFunction;
const uint64_t Count;
+ const uint32_t Index;
// The following fields only exists for promotion candidates with vtable
// information.
@@ -341,7 +363,8 @@ private:
VTableGUIDCountsMap VTableGUIDAndCounts;
SmallVector<Constant *> AddressPoints;
- PromotionCandidate(Function *F, uint64_t C) : TargetFunction(F), Count(C) {}
+ PromotionCandidate(Function *F, uint64_t C, uint32_t I)
+ : TargetFunction(F), Count(C), Index(I) {}
};
// Check if the indirect-call call site should be promoted. Return the number
@@ -356,12 +379,10 @@ private:
// Promote a list of targets for one indirect-call callsite by comparing
// indirect callee with functions. Return true if there are IR
// transformations and false otherwise.
- bool tryToPromoteWithFuncCmp(CallBase &CB, Instruction *VPtr,
- ArrayRef<PromotionCandidate> Candidates,
- uint64_t TotalCount,
- ArrayRef<InstrProfValueData> ICallProfDataRef,
- uint32_t NumCandidates,
- VTableGUIDCountsMap &VTableGUIDCounts);
+ bool tryToPromoteWithFuncCmp(
+ CallBase &CB, Instruction *VPtr, ArrayRef<PromotionCandidate> Candidates,
+ uint64_t TotalCount, MutableArrayRef<InstrProfValueData> ICallProfDataRef,
+ uint32_t NumCandidates, VTableGUIDCountsMap &VTableGUIDCounts);
// Promote a list of targets for one indirect call by comparing vtables with
// functions. Return true if there are IR transformations and false
@@ -394,12 +415,15 @@ private:
Constant *getOrCreateVTableAddressPointVar(GlobalVariable *GV,
uint64_t AddressPointOffset);
- void updateFuncValueProfiles(CallBase &CB, ArrayRef<InstrProfValueData> VDs,
+ void updateFuncValueProfiles(CallBase &CB,
+ MutableArrayRef<InstrProfValueData> VDs,
uint64_t Sum, uint32_t MaxMDCount);
void updateVPtrValueProfiles(Instruction *VPtr,
VTableGUIDCountsMap &VTableGUIDCounts);
+ bool isValidTarget(uint64_t, Function *, const CallBase &, uint64_t);
+
public:
IndirectCallPromoter(
Function &Func, Module &M, InstrProfSymtab *Symtab, bool SamplePGO,
@@ -419,6 +443,53 @@ public:
} // end anonymous namespace
+bool IndirectCallPromoter::isValidTarget(uint64_t Target,
+ Function *TargetFunction,
+ const CallBase &CB, uint64_t Count) {
+ // Don't promote if the symbol is not defined in the module. This avoids
+ // creating a reference to a symbol that doesn't exist in the module
+ // This can happen when we compile with a sample profile collected from
+ // one binary but used for another, which may have profiled targets that
+ // aren't used in the new binary. We might have a declaration initially in
+ // the case where the symbol is globally dead in the binary and removed by
+ // ThinLTO.
+ using namespace ore;
+ if (TargetFunction == nullptr) {
+ LLVM_DEBUG(dbgs() << " Not promote: Cannot find the target\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", &CB)
+ << "Cannot promote indirect call: target with md5sum "
+ << NV("target md5sum", Target)
+ << " not found (count=" << NV("Count", Count) << ")";
+ });
+ return false;
+ }
+ if (!ICPAllowDecls && TargetFunction->isDeclaration()) {
+ LLVM_DEBUG(dbgs() << " Not promote: target definition is not available\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoTargetDef", &CB)
+ << "Do not promote indirect call: target with md5sum "
+ << NV("target md5sum", Target)
+ << " definition not available (count=" << ore::NV("Count", Count)
+ << ")";
+ });
+ return false;
+ }
+
+ const char *Reason = nullptr;
+ if (!isLegalToPromote(CB, TargetFunction, &Reason)) {
+
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", &CB)
+ << "Cannot promote indirect call to "
+ << NV("TargetFunction", TargetFunction)
+ << " (count=" << NV("Count", Count) << "): " << Reason;
+ });
+ return false;
+ }
+ return true;
+}
+
// Indirect-call promotion heuristic. The direct targets are sorted based on
// the count. Stop at the first target that is not promoted.
std::vector<IndirectCallPromoter::PromotionCandidate>
@@ -469,38 +540,15 @@ IndirectCallPromoter::getPromotionCandidatesForCallSite(
break;
}
- // Don't promote if the symbol is not defined in the module. This avoids
- // creating a reference to a symbol that doesn't exist in the module
- // This can happen when we compile with a sample profile collected from
- // one binary but used for another, which may have profiled targets that
- // aren't used in the new binary. We might have a declaration initially in
- // the case where the symbol is globally dead in the binary and removed by
- // ThinLTO.
Function *TargetFunction = Symtab->getFunction(Target);
- if (TargetFunction == nullptr || TargetFunction->isDeclaration()) {
- LLVM_DEBUG(dbgs() << " Not promote: Cannot find the target\n");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", &CB)
- << "Cannot promote indirect call: target with md5sum "
- << ore::NV("target md5sum", Target) << " not found";
- });
- break;
- }
-
- const char *Reason = nullptr;
- if (!isLegalToPromote(CB, TargetFunction, &Reason)) {
- using namespace ore;
-
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", &CB)
- << "Cannot promote indirect call to "
- << NV("TargetFunction", TargetFunction) << " with count of "
- << NV("Count", Count) << ": " << Reason;
- });
- break;
+ if (!isValidTarget(Target, TargetFunction, CB, Count)) {
+ if (ICPAllowCandidateSkip)
+ continue;
+ else
+ break;
}
- Ret.push_back(PromotionCandidate(TargetFunction, Count));
+ Ret.push_back(PromotionCandidate(TargetFunction, Count, I));
TotalCount -= Count;
}
return Ret;
@@ -642,7 +690,7 @@ CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee,
// Promote indirect-call to conditional direct-call for one callsite.
bool IndirectCallPromoter::tryToPromoteWithFuncCmp(
CallBase &CB, Instruction *VPtr, ArrayRef<PromotionCandidate> Candidates,
- uint64_t TotalCount, ArrayRef<InstrProfValueData> ICallProfDataRef,
+ uint64_t TotalCount, MutableArrayRef<InstrProfValueData> ICallProfDataRef,
uint32_t NumCandidates, VTableGUIDCountsMap &VTableGUIDCounts) {
uint32_t NumPromoted = 0;
@@ -655,6 +703,8 @@ bool IndirectCallPromoter::tryToPromoteWithFuncCmp(
NumOfPGOICallPromotion++;
NumPromoted++;
+ // Update the count and this entry will be erased later.
+ ICallProfDataRef[C.Index].Count = 0;
if (!EnableVTableProfileUse || C.VTableGUIDAndCounts.empty())
continue;
@@ -679,21 +729,33 @@ bool IndirectCallPromoter::tryToPromoteWithFuncCmp(
"Number of promoted functions should not be greater than the number "
"of values in profile metadata");
- // Update value profiles on the indirect call.
- updateFuncValueProfiles(CB, ICallProfDataRef.slice(NumPromoted), TotalCount,
- NumCandidates);
+ updateFuncValueProfiles(CB, ICallProfDataRef, TotalCount, NumCandidates);
updateVPtrValueProfiles(VPtr, VTableGUIDCounts);
return true;
}
void IndirectCallPromoter::updateFuncValueProfiles(
- CallBase &CB, ArrayRef<InstrProfValueData> CallVDs, uint64_t TotalCount,
- uint32_t MaxMDCount) {
+ CallBase &CB, MutableArrayRef<InstrProfValueData> CallVDs,
+ uint64_t TotalCount, uint32_t MaxMDCount) {
// First clear the existing !prof.
CB.setMetadata(LLVMContext::MD_prof, nullptr);
+
+ // Sort value profiles by count in descending order.
+ llvm::stable_sort(CallVDs, [](const InstrProfValueData &LHS,
+ const InstrProfValueData &RHS) {
+ return LHS.Count > RHS.Count;
+ });
+ // Drop the <target-value, count> pair if count is zero.
+ ArrayRef<InstrProfValueData> VDs(
+ CallVDs.begin(),
+ llvm::upper_bound(CallVDs, 0U,
+ [](uint64_t Count, const InstrProfValueData &ProfData) {
+ return ProfData.Count <= Count;
+ }));
+
// Annotate the remaining value profiles if counter is not zero.
if (TotalCount != 0)
- annotateValueSite(M, CB, CallVDs, TotalCount, IPVK_IndirectCallTarget,
+ annotateValueSite(M, CB, VDs, TotalCount, IPVK_IndirectCallTarget,
MaxMDCount);
}
@@ -726,7 +788,7 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
uint64_t TotalFuncCount, uint32_t NumCandidates,
MutableArrayRef<InstrProfValueData> ICallProfDataRef,
VTableGUIDCountsMap &VTableGUIDCounts) {
- SmallVector<uint64_t, 4> PromotedFuncCount;
+ SmallVector<std::pair<uint32_t, uint64_t>, 4> PromotedFuncCount;
for (const auto &Candidate : Candidates) {
for (auto &[GUID, Count] : Candidate.VTableGUIDAndCounts)
@@ -771,7 +833,7 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
return Remark;
});
- PromotedFuncCount.push_back(Candidate.Count);
+ PromotedFuncCount.push_back({Candidate.Index, Candidate.Count});
assert(TotalFuncCount >= Candidate.Count &&
"Within one prof metadata, total count is the sum of counts from "
@@ -792,22 +854,12 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
// used to load multiple virtual functions. The vtable profiles needs to be
// updated properly in that case (e.g, for each indirect call annotate both
// type profiles and function profiles in one !prof).
- for (size_t I = 0; I < PromotedFuncCount.size(); I++)
- ICallProfDataRef[I].Count -=
- std::max(PromotedFuncCount[I], ICallProfDataRef[I].Count);
- // Sort value profiles by count in descending order.
- llvm::stable_sort(ICallProfDataRef, [](const InstrProfValueData &LHS,
- const InstrProfValueData &RHS) {
- return LHS.Count > RHS.Count;
- });
- // Drop the <target-value, count> pair if count is zero.
- ArrayRef<InstrProfValueData> VDs(
- ICallProfDataRef.begin(),
- llvm::upper_bound(ICallProfDataRef, 0U,
- [](uint64_t Count, const InstrProfValueData &ProfData) {
- return ProfData.Count <= Count;
- }));
- updateFuncValueProfiles(CB, VDs, TotalFuncCount, NumCandidates);
+ for (size_t I = 0; I < PromotedFuncCount.size(); I++) {
+ uint32_t Index = PromotedFuncCount[I].first;
+ ICallProfDataRef[Index].Count -=
+ std::max(PromotedFuncCount[I].second, ICallProfDataRef[Index].Count);
+ }
+ updateFuncValueProfiles(CB, ICallProfDataRef, TotalFuncCount, NumCandidates);
updateVPtrValueProfiles(VPtr, VTableGUIDCounts);
return true;
}
@@ -822,9 +874,22 @@ bool IndirectCallPromoter::processFunction(ProfileSummaryInfo *PSI) {
uint64_t TotalCount;
auto ICallProfDataRef = ICallAnalysis.getPromotionCandidatesForInstruction(
CB, TotalCount, NumCandidates);
- if (!NumCandidates ||
- (PSI && PSI->hasProfileSummary() && !PSI->isHotCount(TotalCount)))
+ if (!NumCandidates)
continue;
+ if (PSI && PSI->hasProfileSummary()) {
+ // Don't promote cold candidates.
+ if (PSI->isColdCount(TotalCount)) {
+ LLVM_DEBUG(dbgs() << "Don't promote the cold candidate: TotalCount="
+ << TotalCount << "\n");
+ continue;
+ }
+ // Only pormote hot if ICPAllowHotOnly is true.
+ if (ICPAllowHotOnly && !PSI->isHotCount(TotalCount)) {
+ LLVM_DEBUG(dbgs() << "Don't promote the non-hot candidate: TotalCount="
+ << TotalCount << "\n");
+ continue;
+ }
+ }
auto PromotionCandidates = getPromotionCandidatesForCallSite(
*CB, ICallProfDataRef, TotalCount, NumCandidates);
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 599b60d..df31f07 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -158,7 +158,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/bit.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
diff --git a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
index 3fa844e..6135c7b 100644
--- a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
+++ b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
@@ -46,6 +46,8 @@ enum class ARCRuntimeEntryPointKind {
UnsafeClaimRV,
RetainAutorelease,
RetainAutoreleaseRV,
+ AutoreleasePoolPush,
+ AutoreleasePoolPop,
};
/// Declarations for ObjC runtime functions and constants. These are initialized
@@ -67,6 +69,8 @@ public:
UnsafeClaimRV = nullptr;
RetainAutorelease = nullptr;
RetainAutoreleaseRV = nullptr;
+ AutoreleasePoolPush = nullptr;
+ AutoreleasePoolPop = nullptr;
}
Function *get(ARCRuntimeEntryPointKind kind) {
@@ -101,6 +105,12 @@ public:
case ARCRuntimeEntryPointKind::RetainAutoreleaseRV:
return getIntrinsicEntryPoint(RetainAutoreleaseRV,
Intrinsic::objc_retainAutoreleaseReturnValue);
+ case ARCRuntimeEntryPointKind::AutoreleasePoolPush:
+ return getIntrinsicEntryPoint(AutoreleasePoolPush,
+ Intrinsic::objc_autoreleasePoolPush);
+ case ARCRuntimeEntryPointKind::AutoreleasePoolPop:
+ return getIntrinsicEntryPoint(AutoreleasePoolPop,
+ Intrinsic::objc_autoreleasePoolPop);
}
llvm_unreachable("Switch should be a covered switch.");
@@ -143,6 +153,12 @@ private:
/// Declaration for objc_retainAutoreleaseReturnValue().
Function *RetainAutoreleaseRV = nullptr;
+ /// Declaration for objc_autoreleasePoolPush().
+ Function *AutoreleasePoolPush = nullptr;
+
+ /// Declaration for objc_autoreleasePoolPop().
+ Function *AutoreleasePoolPop = nullptr;
+
Function *getIntrinsicEntryPoint(Function *&Decl, Intrinsic::ID IntID) {
if (Decl)
return Decl;
diff --git a/llvm/lib/Transforms/ObjCARC/CMakeLists.txt b/llvm/lib/Transforms/ObjCARC/CMakeLists.txt
index 80867db..4274667 100644
--- a/llvm/lib/Transforms/ObjCARC/CMakeLists.txt
+++ b/llvm/lib/Transforms/ObjCARC/CMakeLists.txt
@@ -2,7 +2,6 @@ add_llvm_component_library(LLVMObjCARCOpts
ObjCARC.cpp
ObjCARCOpts.cpp
ObjCARCExpand.cpp
- ObjCARCAPElim.cpp
ObjCARCContract.cpp
DependencyAnalysis.cpp
ProvenanceAnalysis.cpp
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
deleted file mode 100644
index dceb2eb..0000000
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-//===- ObjCARCAPElim.cpp - ObjC ARC Optimization --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-///
-/// This file defines ObjC ARC optimizations. ARC stands for Automatic
-/// Reference Counting and is a system for managing reference counts for objects
-/// in Objective C.
-///
-/// This specific file implements optimizations which remove extraneous
-/// autorelease pools.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-///
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
-#include "llvm/Analysis/ObjCARCInstKind.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/ObjCARC.h"
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-#define DEBUG_TYPE "objc-arc-ap-elim"
-
-namespace {
-
-/// Interprocedurally determine if calls made by the given call site can
-/// possibly produce autoreleases.
-bool MayAutorelease(const CallBase &CB, unsigned Depth = 0) {
- if (const Function *Callee = CB.getCalledFunction()) {
- if (!Callee->hasExactDefinition())
- return true;
- for (const BasicBlock &BB : *Callee) {
- for (const Instruction &I : BB)
- if (const CallBase *JCB = dyn_cast<CallBase>(&I))
- // This recursion depth limit is arbitrary. It's just great
- // enough to cover known interesting testcases.
- if (Depth < 3 && !JCB->onlyReadsMemory() &&
- MayAutorelease(*JCB, Depth + 1))
- return true;
- }
- return false;
- }
-
- return true;
-}
-
-bool OptimizeBB(BasicBlock *BB) {
- bool Changed = false;
-
- Instruction *Push = nullptr;
- for (Instruction &Inst : llvm::make_early_inc_range(*BB)) {
- switch (GetBasicARCInstKind(&Inst)) {
- case ARCInstKind::AutoreleasepoolPush:
- Push = &Inst;
- break;
- case ARCInstKind::AutoreleasepoolPop:
- // If this pop matches a push and nothing in between can autorelease,
- // zap the pair.
- if (Push && cast<CallInst>(&Inst)->getArgOperand(0) == Push) {
- Changed = true;
- LLVM_DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop "
- "autorelease pair:\n"
- " Pop: "
- << Inst << "\n"
- << " Push: " << *Push
- << "\n");
- Inst.eraseFromParent();
- Push->eraseFromParent();
- }
- Push = nullptr;
- break;
- case ARCInstKind::CallOrUser:
- if (MayAutorelease(cast<CallBase>(Inst)))
- Push = nullptr;
- break;
- default:
- break;
- }
- }
-
- return Changed;
-}
-
-bool runImpl(Module &M) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
- if (!ModuleHasARC(M))
- return false;
- // Find the llvm.global_ctors variable, as the first step in
- // identifying the global constructors. In theory, unnecessary autorelease
- // pools could occur anywhere, but in practice it's pretty rare. Global
- // ctors are a place where autorelease pools get inserted automatically,
- // so it's pretty common for them to be unnecessary, and it's pretty
- // profitable to eliminate them.
- GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
- if (!GV)
- return false;
-
- assert(GV->hasDefinitiveInitializer() &&
- "llvm.global_ctors is uncooperative!");
-
- bool Changed = false;
-
- // Dig the constructor functions out of GV's initializer.
- ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
- for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end();
- OI != OE; ++OI) {
- Value *Op = *OI;
- // llvm.global_ctors is an array of three-field structs where the second
- // members are constructor functions.
- Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1));
- // If the user used a constructor function with the wrong signature and
- // it got bitcasted or whatever, look the other way.
- if (!F)
- continue;
- // Only look at function definitions.
- if (F->isDeclaration())
- continue;
- // Only look at functions with one basic block.
- if (std::next(F->begin()) != F->end())
- continue;
- // Ok, a single-block constructor function definition. Try to optimize it.
- Changed |= OptimizeBB(&F->front());
- }
-
- return Changed;
-}
-
-} // namespace
-
-PreservedAnalyses ObjCARCAPElimPass::run(Module &M, ModuleAnalysisManager &AM) {
- if (!runImpl(M))
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 5eb3f51..66a2c76 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -39,6 +39,7 @@
#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
#include "llvm/Analysis/ObjCARCInstKind.h"
#include "llvm/Analysis/ObjCARCUtil.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
@@ -132,11 +133,8 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
//
// The second retain and autorelease can be deleted.
-// TODO: It should be possible to delete
-// objc_autoreleasePoolPush and objc_autoreleasePoolPop
-// pairs if nothing is actually autoreleased between them. Also, autorelease
-// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
-// after inlining) can be turned into plain release calls.
+// TODO: Autorelease calls followed by objc_autoreleasePoolPop calls (perhaps in
+// ObjC++ code after inlining) can be turned into plain release calls.
// TODO: Critical-edge splitting. If the optimial insertion point is
// a critical edge, the current algorithm has to fail, because it doesn't
@@ -566,6 +564,8 @@ class ObjCARCOpt {
void OptimizeReturns(Function &F);
+ void OptimizeAutoreleasePools(Function &F);
+
template <typename PredicateT>
static void cloneOpBundlesIf(CallBase *CI,
SmallVectorImpl<OperandBundleDef> &OpBundles,
@@ -2473,6 +2473,11 @@ bool ObjCARCOpt::run(Function &F, AAResults &AA) {
(1 << unsigned(ARCInstKind::AutoreleaseRV))))
OptimizeReturns(F);
+ // Optimizations for autorelease pools.
+ if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::AutoreleasepoolPush)) |
+ (1 << unsigned(ARCInstKind::AutoreleasepoolPop))))
+ OptimizeAutoreleasePools(F);
+
// Gather statistics after optimization.
#ifndef NDEBUG
if (AreStatisticsEnabled()) {
@@ -2485,6 +2490,183 @@ bool ObjCARCOpt::run(Function &F, AAResults &AA) {
return Changed;
}
+/// Interprocedurally determine if calls made by the given call site can
+/// possibly produce autoreleases.
+bool MayAutorelease(const CallBase &CB, unsigned Depth = 0) {
+ if (CB.onlyReadsMemory())
+ return false;
+
+ // This recursion depth limit is arbitrary. It's just great
+ // enough to cover known interesting testcases.
+ if (Depth > 5)
+ return true;
+
+ if (const Function *Callee = CB.getCalledFunction()) {
+ if (!Callee->hasExactDefinition())
+ return true;
+ for (const BasicBlock &BB : *Callee) {
+ for (const Instruction &I : BB) {
+ // TODO: Ignore all instructions between autorelease pools
+ ARCInstKind InstKind = GetBasicARCInstKind(&I);
+ switch (InstKind) {
+ case ARCInstKind::Autorelease:
+ case ARCInstKind::AutoreleaseRV:
+ case ARCInstKind::FusedRetainAutorelease:
+ case ARCInstKind::FusedRetainAutoreleaseRV:
+ case ARCInstKind::LoadWeak:
+ // These may produce autoreleases
+ return true;
+
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV:
+ case ARCInstKind::UnsafeClaimRV:
+ case ARCInstKind::RetainBlock:
+ case ARCInstKind::Release:
+ case ARCInstKind::NoopCast:
+ case ARCInstKind::LoadWeakRetained:
+ case ARCInstKind::StoreWeak:
+ case ARCInstKind::InitWeak:
+ case ARCInstKind::MoveWeak:
+ case ARCInstKind::CopyWeak:
+ case ARCInstKind::DestroyWeak:
+ case ARCInstKind::StoreStrong:
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::AutoreleasepoolPop:
+ // These ObjC runtime functions don't produce autoreleases
+ break;
+
+ case ARCInstKind::CallOrUser:
+ case ARCInstKind::Call:
+ // For non-ObjC function calls, recursively analyze
+ if (MayAutorelease(cast<CallBase>(I), Depth + 1))
+ return true;
+ break;
+
+ case ARCInstKind::IntrinsicUser:
+ case ARCInstKind::User:
+ case ARCInstKind::None:
+ // These are not relevant for autorelease analysis
+ break;
+ }
+ }
+ }
+ return false;
+ }
+
+ return true;
+}
+
+/// Optimize autorelease pools by eliminating empty push/pop pairs.
+void ObjCARCOpt::OptimizeAutoreleasePools(Function &F) {
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeAutoreleasePools ==\n");
+
+ OptimizationRemarkEmitter ORE(&F);
+
+ // Process each basic block independently.
+ // TODO: Can we optimize inter-block autorelease pool pairs?
+ // This would involve tracking autorelease pool state across blocks.
+ for (BasicBlock &BB : F) {
+ // Use a stack to track nested autorelease pools
+ SmallVector<std::pair<CallInst *, bool>, 4>
+ PoolStack; // {push_inst, has_autorelease_in_scope}
+
+ for (Instruction &Inst : llvm::make_early_inc_range(BB)) {
+ ARCInstKind Class = GetBasicARCInstKind(&Inst);
+
+ switch (Class) {
+ case ARCInstKind::AutoreleasepoolPush: {
+ // Start tracking a new autorelease pool scope
+ auto *Push = cast<CallInst>(&Inst);
+ PoolStack.push_back(
+ {Push, false}); // {push_inst, has_autorelease_in_scope}
+ LLVM_DEBUG(dbgs() << "Found autorelease pool push: " << *Push << "\n");
+ break;
+ }
+
+ case ARCInstKind::AutoreleasepoolPop: {
+ auto *Pop = cast<CallInst>(&Inst);
+
+ if (PoolStack.empty())
+ break;
+
+ auto &TopPool = PoolStack.back();
+ CallInst *PendingPush = TopPool.first;
+ bool HasAutoreleaseInScope = TopPool.second;
+
+ // Pop the stack - remove this pool scope
+ PoolStack.pop_back();
+
+ // Bail if this pop doesn't match the pending push
+ if (Pop->getArgOperand(0)->stripPointerCasts() != PendingPush)
+ break;
+
+ // Bail if there were autoreleases in this scope
+ if (HasAutoreleaseInScope)
+ break;
+
+ // Optimize: eliminate this empty autorelease pool pair
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "AutoreleasePoolElimination",
+ PendingPush)
+ << "eliminated empty autorelease pool pair";
+ });
+
+ // Replace all uses of push with poison before deletion
+ PendingPush->replaceAllUsesWith(
+ PoisonValue::get(PendingPush->getType()));
+
+ Pop->eraseFromParent();
+ PendingPush->eraseFromParent();
+
+ Changed = true;
+ ++NumNoops;
+ break;
+ }
+ case ARCInstKind::CallOrUser:
+ case ARCInstKind::Call:
+ if (!MayAutorelease(cast<CallBase>(Inst)))
+ break;
+ LLVM_FALLTHROUGH;
+ case ARCInstKind::Autorelease:
+ case ARCInstKind::AutoreleaseRV:
+ case ARCInstKind::FusedRetainAutorelease:
+ case ARCInstKind::FusedRetainAutoreleaseRV:
+ case ARCInstKind::LoadWeak: {
+ // Track that we have autorelease calls in the current pool scope
+ if (!PoolStack.empty()) {
+ PoolStack.back().second = true; // Set has_autorelease_in_scope = true
+ LLVM_DEBUG(
+ dbgs()
+ << "Found autorelease or potential autorelease in pool scope: "
+ << Inst << "\n");
+ }
+ break;
+ }
+
+ // Enumerate all remaining ARCInstKind cases explicitly
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV:
+ case ARCInstKind::UnsafeClaimRV:
+ case ARCInstKind::RetainBlock:
+ case ARCInstKind::Release:
+ case ARCInstKind::NoopCast:
+ case ARCInstKind::LoadWeakRetained:
+ case ARCInstKind::StoreWeak:
+ case ARCInstKind::InitWeak:
+ case ARCInstKind::MoveWeak:
+ case ARCInstKind::CopyWeak:
+ case ARCInstKind::DestroyWeak:
+ case ARCInstKind::StoreStrong:
+ case ARCInstKind::IntrinsicUser:
+ case ARCInstKind::User:
+ case ARCInstKind::None:
+ // These instruction kinds don't affect autorelease pool optimization
+ break;
+ }
+ }
+ }
+}
+
/// @}
///
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 8c84b0d..03b92d3 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -88,7 +88,6 @@
#include <cassert>
#include <cstdint>
#include <utility>
-#include <vector>
using namespace llvm;
using namespace SCEVPatternMatch;
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 70e9eee..08446cc 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -17,8 +17,8 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/LoopCacheAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -70,6 +70,13 @@ namespace {
using LoopVector = SmallVector<Loop *, 8>;
+/// A list of direction vectors. Each entry represents a direction vector
+/// corresponding to one or more dependencies existing in the loop nest. The
+/// length of all direction vectors is equal and is N + 1, where N is the depth
+/// of the loop nest. The first N elements correspond to the dependency
+/// direction of each N loops. The last one indicates whether this entry is
+/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
+/// is defined in LoopAccessAnalysis.
// TODO: Check if we can use a sparse matrix here.
using CharMatrix = std::vector<std::vector<char>>;
@@ -126,11 +133,33 @@ static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) {
static void printDepMatrix(CharMatrix &DepMatrix) {
for (auto &Row : DepMatrix) {
- for (auto D : Row)
+ // Drop the last element because it is a flag indicating whether this is
+ // forward dependency or not, which doesn't affect the legality check.
+ for (char D : drop_end(Row))
LLVM_DEBUG(dbgs() << D << " ");
LLVM_DEBUG(dbgs() << "\n");
}
}
+
+/// Return true if \p Src appears before \p Dst in the same basic block.
+/// Precondition: \p Src and \Dst are distinct instructions within the same
+/// basic block.
+static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
+ assert(Src->getParent() == Dst->getParent() && Src != Dst &&
+ "Expected Src and Dst to be different instructions in the same BB");
+
+ bool FoundSrc = false;
+ for (const Instruction &I : *(Src->getParent())) {
+ if (&I == Src) {
+ FoundSrc = true;
+ continue;
+ }
+ if (&I == Dst)
+ return FoundSrc;
+ }
+
+ llvm_unreachable("Dst not found");
+}
#endif
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
@@ -174,7 +203,10 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
return false;
}
ValueVector::iterator I, IE, J, JE;
- StringSet<> Seen;
+
+ // Manage direction vectors that are already seen. Map each direction vector
+ // to an index of DepMatrix at which it is stored.
+ StringMap<unsigned> Seen;
for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
for (J = I, JE = MemInstr.end(); J != JE; ++J) {
@@ -228,9 +260,49 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
Dep.push_back('I');
}
+ // Test whether the dependency is forward or not.
+ bool IsKnownForward = true;
+ if (Src->getParent() != Dst->getParent()) {
+ // In general, when Src and Dst are in different BBs, the execution
+ // order of them within a single iteration is not guaranteed. Treat
+ // conservatively as not-forward dependency in this case.
+ IsKnownForward = false;
+ } else {
+ // Src and Dst are in the same BB. If they are the different
+ // instructions, Src should appear before Dst in the BB as they are
+ // stored to MemInstr in that order.
+ assert((Src == Dst || inThisOrder(Src, Dst)) &&
+ "Unexpected instructions");
+
+ // If the Dependence object is reversed (due to normalization), it
+ // represents the dependency from Dst to Src, meaning it is a backward
+ // dependency. Otherwise it should be a forward dependency.
+ bool IsReversed = D->getSrc() != Src;
+ if (IsReversed)
+ IsKnownForward = false;
+ }
+
+ // Initialize the last element. Assume forward dependencies only; it
+ // will be updated later if there is any non-forward dependency.
+ Dep.push_back('<');
+
+ // The last element should express the "summary" among one or more
+ // direction vectors whose first N elements are the same (where N is
+ // the depth of the loop nest). Hence we exclude the last element from
+ // the Seen map.
+ auto [Ite, Inserted] = Seen.try_emplace(
+ StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size());
+
// Make sure we only add unique entries to the dependency matrix.
- if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
+ if (Inserted)
DepMatrix.push_back(Dep);
+
+ // If we cannot prove that this dependency is forward, change the last
+ // element of the corresponding entry. Since a `[... *]` dependency
+ // includes a `[... <]` dependency, we do not need to keep both and
+ // change the existing entry instead.
+ if (!IsKnownForward)
+ DepMatrix[Ite->second].back() = '*';
}
}
}
@@ -281,11 +353,12 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
continue;
// Check if the direction vector is lexicographically positive (or zero)
- // for both before/after exchanged.
- if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
+ // for both before/after exchanged. Ignore the last element because it
+ // doesn't affect the legality.
+ if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
return false;
std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
- if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
+ if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
return false;
}
return true;
@@ -1334,22 +1407,35 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
for (const auto &Dep : DepMatrix) {
char Dir = Dep[LoopId];
- if (Dir != 'I' && Dir != '=')
- return false;
+ char DepType = Dep.back();
+ assert((DepType == '<' || DepType == '*') &&
+ "Unexpected element in dependency vector");
+
+ // There are no loop-carried dependencies.
+ if (Dir == '=' || Dir == 'I')
+ continue;
+
+ // DepType being '<' means that this direction vector represents a forward
+ // dependency. In principle, a loop with '<' direction can be vectorized in
+ // this case.
+ if (Dir == '<' && DepType == '<')
+ continue;
+
+ // We cannot prove that the loop is vectorizable.
+ return false;
}
return true;
}
std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
- // If the outer loop is not loop independent it is not profitable to move
- // this to inner position, since doing so would not enable inner loop
- // parallelism.
+ // If the outer loop cannot be vectorized, it is not profitable to move this
+ // to inner position.
if (!canVectorize(DepMatrix, OuterLoopId))
return false;
- // If inner loop has dependence and outer loop is loop independent then it is
- // profitable to interchange to enable inner loop parallelism.
+ // If the inner loop cannot be vectorized but the outer loop can be, then it
+ // is profitable to interchange to enable inner loop parallelism.
if (!canVectorize(DepMatrix, InnerLoopId))
return true;
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 9e318b0..e3ef9d8 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -3785,7 +3785,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
// Ignore icmp instructions which are already being analyzed.
if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
unsigned OtherIdx = !U.getOperandNo();
- Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
+ Value *OtherOp = ICI->getOperand(OtherIdx);
if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
continue;
}
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 84d1c0b..9220abb 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1593,11 +1593,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
// since both llvm.lifetime.start and llvm.lifetime.end intrinsics
// practically fill all the bytes of the alloca with an undefined
// value, although conceptually marked as alive/dead.
- int64_t Size = cast<ConstantInt>(UI->getOperand(0))->getSExtValue();
- if (Size < 0 || Size == DestSize) {
- LifetimeMarkers.push_back(UI);
- continue;
- }
+ LifetimeMarkers.push_back(UI);
+ continue;
}
AAMetadataInstrs.insert(UI);
@@ -1614,9 +1611,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
return true;
};
- // Check that dest has no Mod/Ref, from the alloca to the Store, except full
- // size lifetime intrinsics. And collect modref inst for the reachability
- // check.
+ // Check that dest has no Mod/Ref, from the alloca to the Store. And collect
+ // modref inst for the reachability check.
ModRefInfo DestModRef = ModRefInfo::NoModRef;
MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size));
SmallVector<BasicBlock *, 8> ReachabilityWorklist;
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index ced61cb..aae5d60 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -458,8 +458,10 @@ bool ScalarizerVisitor::visit(Function &F) {
Instruction *I = &*II;
bool Done = InstVisitor::visit(I);
++II;
- if (Done && I->getType()->isVoidTy())
+ if (Done && I->getType()->isVoidTy()) {
I->eraseFromParent();
+ Scalarized = true;
+ }
}
}
return finish();
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 9b40fc0..f6959ca2 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -2144,9 +2144,23 @@ void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName,
bool CurrentLoopValid, bool PartiallyInvariant,
bool InjectedCondition, ArrayRef<Loop *> NewLoops) {
- // If we did a non-trivial unswitch, we have added new (cloned) loops.
- if (!NewLoops.empty())
+ auto RecordLoopAsUnswitched = [&](Loop *TargetLoop, StringRef Tag,
+ StringRef DisableTag) {
+ auto &Ctx = TargetLoop->getHeader()->getContext();
+ MDNode *DisableMD = MDNode::get(Ctx, MDString::get(Ctx, DisableTag));
+ MDNode *NewLoopID = makePostTransformationMetadata(
+ Ctx, TargetLoop->getLoopID(), {Tag}, {DisableMD});
+ TargetLoop->setLoopID(NewLoopID);
+ };
+
+ // If we performed a non-trivial unswitch, we have added new cloned loops.
+ // Mark such newly-created loops as visited.
+ if (!NewLoops.empty()) {
+ for (Loop *NL : NewLoops)
+ RecordLoopAsUnswitched(NL, "llvm.loop.unswitch.nontrivial",
+ "llvm.loop.unswitch.nontrivial.disable");
U.addSiblingLoops(NewLoops);
+ }
// If the current loop remains valid, we should revisit it to catch any
// other unswitch opportunities. Otherwise, we need to mark it as deleted.
@@ -2154,24 +2168,12 @@ void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName,
if (PartiallyInvariant) {
// Mark the new loop as partially unswitched, to avoid unswitching on
// the same condition again.
- auto &Context = L.getHeader()->getContext();
- MDNode *DisableUnswitchMD = MDNode::get(
- Context,
- MDString::get(Context, "llvm.loop.unswitch.partial.disable"));
- MDNode *NewLoopID = makePostTransformationMetadata(
- Context, L.getLoopID(), {"llvm.loop.unswitch.partial"},
- {DisableUnswitchMD});
- L.setLoopID(NewLoopID);
+ RecordLoopAsUnswitched(&L, "llvm.loop.unswitch.partial",
+ "llvm.loop.unswitch.partial.disable");
} else if (InjectedCondition) {
// Do the same for injection of invariant conditions.
- auto &Context = L.getHeader()->getContext();
- MDNode *DisableUnswitchMD = MDNode::get(
- Context,
- MDString::get(Context, "llvm.loop.unswitch.injection.disable"));
- MDNode *NewLoopID = makePostTransformationMetadata(
- Context, L.getLoopID(), {"llvm.loop.unswitch.injection"},
- {DisableUnswitchMD});
- L.setLoopID(NewLoopID);
+ RecordLoopAsUnswitched(&L, "llvm.loop.unswitch.injection",
+ "llvm.loop.unswitch.injection.disable");
} else
U.revisitCurrentLoop();
} else
@@ -2809,9 +2811,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
}
/// Cost multiplier is a way to limit potentially exponential behavior
-/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch
-/// candidates available. Also accounting for the number of "sibling" loops with
-/// the idea to account for previous unswitches that already happened on this
+/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch
+/// candidates available. Also consider the number of "sibling" loops with
+/// the idea of accounting for previous unswitches that already happened on this
/// cluster of loops. There was an attempt to keep this formula simple,
/// just enough to limit the worst case behavior. Even if it is not that simple
/// now it is still not an attempt to provide a detailed heuristic size
@@ -3507,8 +3509,9 @@ static bool unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
SmallVector<NonTrivialUnswitchCandidate, 4> UnswitchCandidates;
IVConditionInfo PartialIVInfo;
Instruction *PartialIVCondBranch = nullptr;
- collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo,
- PartialIVCondBranch, L, LI, AA, MSSAU);
+ if (!findOptionMDForLoop(&L, "llvm.loop.unswitch.nontrivial.disable"))
+ collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo,
+ PartialIVCondBranch, L, LI, AA, MSSAU);
if (!findOptionMDForLoop(&L, "llvm.loop.unswitch.injection.disable"))
collectUnswitchCandidatesWithInjections(UnswitchCandidates, PartialIVInfo,
PartialIVCondBranch, L, DT, LI, AA,
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index a69d649..44e63a0 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
#include "llvm/Analysis/RegionPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -128,6 +129,7 @@ struct PredInfo {
using BBPredicates = DenseMap<BasicBlock *, PredInfo>;
using PredMap = DenseMap<BasicBlock *, BBPredicates>;
using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;
+using Val2BBMap = DenseMap<Value *, BasicBlock *>;
// A traits type that is intended to be used in graph algorithms. The graph
// traits starts at an entry node, and traverses the RegionNodes that are in
@@ -279,7 +281,7 @@ class StructurizeCFG {
ConstantInt *BoolTrue;
ConstantInt *BoolFalse;
Value *BoolPoison;
-
+ const TargetTransformInfo *TTI;
Function *Func;
Region *ParentRegion;
@@ -301,8 +303,12 @@ class StructurizeCFG {
PredMap LoopPreds;
BranchVector LoopConds;
+ Val2BBMap HoistedValues;
+
RegionNode *PrevNode;
+ void hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB, BasicBlock *ThenBB);
+
void orderNodes();
void analyzeLoops(RegionNode *N);
@@ -332,6 +338,8 @@ class StructurizeCFG {
void simplifyAffectedPhis();
+ void simplifyHoistedPhis();
+
DebugLoc killTerminator(BasicBlock *BB);
void changeExit(RegionNode *Node, BasicBlock *NewExit,
@@ -359,7 +367,7 @@ class StructurizeCFG {
public:
void init(Region *R);
- bool run(Region *R, DominatorTree *DT);
+ bool run(Region *R, DominatorTree *DT, const TargetTransformInfo *TTI);
bool makeUniformRegion(Region *R, UniformityInfo &UA);
};
@@ -385,8 +393,11 @@ public:
if (SCFG.makeUniformRegion(R, UA))
return false;
}
+ Function *F = R->getEntry()->getParent();
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*F);
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- return SCFG.run(R, DT);
+ return SCFG.run(R, DT, TTI);
}
StringRef getPassName() const override { return "Structurize control flow"; }
@@ -394,7 +405,9 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
if (SkipUniformRegions)
AU.addRequired<UniformityInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
RegionPass::getAnalysisUsage(AU);
@@ -403,6 +416,34 @@ public:
} // end anonymous namespace
+/// Checks whether an instruction is zero cost instruction and checks if the
+/// operands are from different BB. If so, this instruction can be coalesced
+/// if its hoisted to predecessor block. So, this returns true.
+static bool isHoistableInstruction(Instruction *I, BasicBlock *BB,
+ const TargetTransformInfo *TTI) {
+ if (I->getParent() != BB || isa<PHINode>(I))
+ return false;
+
+ // If the instruction is not a zero cost instruction, return false.
+ auto Cost = TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+ InstructionCost::CostType CostVal =
+ Cost.isValid()
+ ? Cost.getValue()
+ : (InstructionCost::CostType)TargetTransformInfo::TCC_Expensive;
+ if (CostVal != 0)
+ return false;
+
+ // Check if any operands are instructions defined in the same block.
+ for (auto &Op : I->operands()) {
+ if (auto *OpI = dyn_cast<Instruction>(Op)) {
+ if (OpI->getParent() == BB)
+ return false;
+ }
+ }
+
+ return true;
+}
+
char StructurizeCFGLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(StructurizeCFGLegacyPass, "structurizecfg",
@@ -413,6 +454,39 @@ INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg",
"Structurize the CFG", false, false)
+/// Structurization can introduce unnecessary VGPR copies due to register
+/// coalescing interference. For example, if the Else block has a zero-cost
+/// instruction and the Then block modifies the VGPR value, only one value is
+/// live at a time in merge block before structurization. After structurization,
+/// the coalescer may incorrectly treat the Then value as live in the Else block
+/// (via the path Then → Flow → Else), leading to unnecessary VGPR copies.
+///
+/// This function examines phi nodes whose incoming values are zero-cost
+/// instructions in the Else block. It identifies such values that can be safely
+/// hoisted and moves them to the nearest common dominator of Then and Else
+/// blocks. A follow-up function after setting PhiNodes assigns the hoisted
+/// value to poison phi nodes along the if→flow edge, aiding register coalescing
+/// and minimizing unnecessary live ranges.
+void StructurizeCFG::hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB,
+ BasicBlock *ThenBB) {
+
+ BasicBlock *ElseSucc = ElseBB->getSingleSuccessor();
+ BasicBlock *CommonDominator = DT->findNearestCommonDominator(ElseBB, ThenBB);
+
+ if (!ElseSucc || !CommonDominator)
+ return;
+ Instruction *Term = CommonDominator->getTerminator();
+ for (PHINode &Phi : ElseSucc->phis()) {
+ Value *ElseVal = Phi.getIncomingValueForBlock(ElseBB);
+ auto *Inst = dyn_cast<Instruction>(ElseVal);
+ if (!Inst || !isHoistableInstruction(Inst, ElseBB, TTI))
+ continue;
+ Inst->removeFromParent();
+ Inst->insertInto(CommonDominator, Term->getIterator());
+ HoistedValues[Inst] = CommonDominator;
+ }
+}
+
/// Build up the general order of nodes, by performing a topological sort of the
/// parent region's nodes, while ensuring that there is no outer cycle node
/// between any two inner cycle nodes.
@@ -535,7 +609,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
BasicBlock *Other = Term->getSuccessor(!i);
if (Visited.count(Other) && !Loops.count(Other) &&
!Pred.count(Other) && !Pred.count(P)) {
-
+ hoistZeroCostElseBlockPhiValues(Succ, Other);
Pred[Other] = {BoolFalse, std::nullopt};
Pred[P] = {BoolTrue, std::nullopt};
continue;
@@ -891,6 +965,44 @@ void StructurizeCFG::setPhiValues() {
AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end());
}
+/// Updates PHI nodes after hoisted zero cost instructions by replacing poison
+/// entries on Flow nodes with the appropriate hoisted values
+void StructurizeCFG::simplifyHoistedPhis() {
+ for (WeakVH VH : AffectedPhis) {
+ PHINode *Phi = dyn_cast_or_null<PHINode>(VH);
+ if (!Phi || Phi->getNumIncomingValues() != 2)
+ continue;
+
+ for (int i = 0; i < 2; i++) {
+ Value *V = Phi->getIncomingValue(i);
+ auto BBIt = HoistedValues.find(V);
+
+ if (BBIt == HoistedValues.end())
+ continue;
+
+ Value *OtherV = Phi->getIncomingValue(!i);
+ PHINode *OtherPhi = dyn_cast<PHINode>(OtherV);
+ if (!OtherPhi)
+ continue;
+
+ int PoisonValBBIdx = -1;
+ for (size_t i = 0; i < OtherPhi->getNumIncomingValues(); i++) {
+ if (!isa<PoisonValue>(OtherPhi->getIncomingValue(i)))
+ continue;
+ PoisonValBBIdx = i;
+ break;
+ }
+ if (PoisonValBBIdx == -1 ||
+ !DT->dominates(BBIt->second,
+ OtherPhi->getIncomingBlock(PoisonValBBIdx)))
+ continue;
+
+ OtherPhi->setIncomingValue(PoisonValBBIdx, V);
+ Phi->setIncomingValue(i, OtherV);
+ }
+ }
+}
+
void StructurizeCFG::simplifyAffectedPhis() {
bool Changed;
do {
@@ -1283,12 +1395,13 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) {
}
/// Run the transformation for each region found
-bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
+bool StructurizeCFG::run(Region *R, DominatorTree *DT,
+ const TargetTransformInfo *TTI) {
if (R->isTopLevelRegion())
return false;
this->DT = DT;
-
+ this->TTI = TTI;
Func = R->getEntry()->getParent();
assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator.");
@@ -1300,6 +1413,7 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
insertConditions(false);
insertConditions(true);
setPhiValues();
+ simplifyHoistedPhis();
simplifyConditions();
simplifyAffectedPhis();
rebuildSSA();
@@ -1349,7 +1463,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F,
bool Changed = false;
DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto &RI = AM.getResult<RegionInfoAnalysis>(F);
-
+ TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
UniformityInfo *UI = nullptr;
if (SkipUniformRegions)
UI = &AM.getResult<UniformityInfoAnalysis>(F);
@@ -1368,7 +1482,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F,
continue;
}
- Changed |= SCFG.run(R, DT);
+ Changed |= SCFG.run(R, DT, TTI);
}
if (!Changed)
return PreservedAnalyses::all();
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
index f7e66ec..a4fa0e2 100644
--- a/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -68,6 +68,7 @@ add_llvm_component_library(LLVMTransformUtils
MoveAutoInit.cpp
NameAnonGlobals.cpp
PredicateInfo.cpp
+ ProfileVerify.cpp
PromoteMemoryToRegister.cpp
RelLookupTableConverter.cpp
ScalarEvolutionExpander.cpp
diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index 4210ce6..291e2a5 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -22,7 +22,6 @@
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassInstrumentation.h"
#include "llvm/Pass.h"
diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp
index 8d18c75..a9e08ad 100644
--- a/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -41,7 +41,6 @@
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PredIteratorCache.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index f89d36f..babd7f6 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -482,16 +482,11 @@ bool llvm::wouldInstructionBeTriviallyDead(const Instruction *I,
if (II->isLifetimeStartOrEnd()) {
auto *Arg = II->getArgOperand(1);
- // Lifetime intrinsics are dead when their right-hand is undef.
- if (isa<UndefValue>(Arg))
- return true;
- // If the right-hand is an alloc, global, or argument and the only uses
- // are lifetime intrinsics then the intrinsics are dead.
- if (isa<AllocaInst>(Arg) || isa<GlobalValue>(Arg) || isa<Argument>(Arg))
- return llvm::all_of(Arg->uses(), [](Use &Use) {
- return isa<LifetimeIntrinsic>(Use.getUser());
- });
- return false;
+ // If the only uses of the alloca are lifetime intrinsics, then the
+ // intrinsics are dead.
+ return llvm::all_of(Arg->uses(), [](Use &Use) {
+ return isa<LifetimeIntrinsic>(Use.getUser());
+ });
}
// Assumptions are dead if their condition is trivially true.
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index ac413c9..de9deab 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -12,7 +12,6 @@
#include "llvm/Transforms/Utils/PredicateInfo.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AssumptionCache.h"
diff --git a/llvm/lib/Transforms/Utils/ProfileVerify.cpp b/llvm/lib/Transforms/Utils/ProfileVerify.cpp
new file mode 100644
index 0000000..b972132
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/ProfileVerify.cpp
@@ -0,0 +1,129 @@
+//===- ProfileVerify.cpp - Verify profile info for testing ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ProfileVerify.h"
+#include "llvm/ADT/DynamicAPInt.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Analysis.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
+#include "llvm/Support/BranchProbability.h"
+
+using namespace llvm;
+namespace {
+class ProfileInjector {
+ Function &F;
+ FunctionAnalysisManager &FAM;
+
+public:
+ static const Instruction *
+ getTerminatorBenefitingFromMDProf(const BasicBlock &BB) {
+ if (succ_size(&BB) < 2)
+ return nullptr;
+ auto *Term = BB.getTerminator();
+ return (isa<BranchInst>(Term) || isa<SwitchInst>(Term) ||
+ isa<IndirectBrInst>(Term) || isa<CallBrInst>(Term))
+ ? Term
+ : nullptr;
+ }
+
+ static Instruction *getTerminatorBenefitingFromMDProf(BasicBlock &BB) {
+ return const_cast<Instruction *>(
+ getTerminatorBenefitingFromMDProf(const_cast<const BasicBlock &>(BB)));
+ }
+
+ ProfileInjector(Function &F, FunctionAnalysisManager &FAM) : F(F), FAM(FAM) {}
+ bool inject();
+};
+} // namespace
+
+// FIXME: currently this injects only for terminators. Select isn't yet
+// supported.
+bool ProfileInjector::inject() {
+ // Get whatever branch probability info can be derived from the given IR -
+ // whether it has or not metadata. The main intention for this pass is to
+ // ensure that other passes don't drop or "forget" to update MD_prof. We do
+ // this as a mode in which lit tests would run. We want to avoid changing the
+ // behavior of those tests. A pass may use BPI (or BFI, which is computed from
+ // BPI). If no metadata is present, BPI is guesstimated by
+ // BranchProbabilityAnalysis. The injector (this pass) only persists whatever
+ // information the analysis provides, in other words, the pass being tested
+ // will get the same BPI it does if the injector wasn't running.
+ auto &BPI = FAM.getResult<BranchProbabilityAnalysis>(F);
+
+ bool Changed = false;
+ for (auto &BB : F) {
+ auto *Term = getTerminatorBenefitingFromMDProf(BB);
+ if (!Term || Term->getMetadata(LLVMContext::MD_prof))
+ continue;
+ SmallVector<BranchProbability> Probs;
+ Probs.reserve(Term->getNumSuccessors());
+ for (auto I = 0U, E = Term->getNumSuccessors(); I < E; ++I)
+ Probs.emplace_back(BPI.getEdgeProbability(&BB, Term->getSuccessor(I)));
+
+ assert(llvm::find_if(Probs,
+ [](const BranchProbability &P) {
+ return P.isUnknown();
+ }) == Probs.end() &&
+ "All branch probabilities should be valid");
+ const auto *FirstZeroDenominator =
+ find_if(Probs, [](const BranchProbability &P) {
+ return P.getDenominator() == 0;
+ });
+ (void)FirstZeroDenominator;
+ assert(FirstZeroDenominator == Probs.end());
+ const auto *FirstNonZeroNumerator =
+ find_if(Probs, [](const BranchProbability &P) { return !P.isZero(); });
+ assert(FirstNonZeroNumerator != Probs.end());
+ DynamicAPInt LCM(Probs[0].getDenominator());
+ DynamicAPInt GCD(FirstNonZeroNumerator->getNumerator());
+ for (const auto &Prob : drop_begin(Probs)) {
+ if (!Prob.getNumerator())
+ continue;
+ LCM = llvm::lcm(LCM, DynamicAPInt(Prob.getDenominator()));
+ GCD = llvm::gcd(GCD, DynamicAPInt(Prob.getNumerator()));
+ }
+ SmallVector<uint32_t> Weights;
+ Weights.reserve(Term->getNumSuccessors());
+ for (const auto &Prob : Probs) {
+ DynamicAPInt W =
+ (Prob.getNumerator() * LCM / GCD) / Prob.getDenominator();
+ Weights.emplace_back(static_cast<uint32_t>((int64_t)W));
+ }
+ setBranchWeights(*Term, Weights, /*IsExpected=*/false);
+ Changed = true;
+ }
+ return Changed;
+}
+
+PreservedAnalyses ProfileInjectorPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ ProfileInjector PI(F, FAM);
+ if (!PI.inject())
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+PreservedAnalyses ProfileVerifierPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ for (const auto &BB : F)
+ if (const auto *Term =
+ ProfileInjector::getTerminatorBenefitingFromMDProf(BB))
+ if (!Term->getMetadata(LLVMContext::MD_prof))
+ F.getContext().emitError("Profile verification failed");
+
+ return PreservedAnalyses::none();
+}
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index ed08c0b..ddb062b 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
@@ -42,6 +43,7 @@ cl::opt<unsigned> llvm::SCEVCheapExpansionBudget(
"controls the budget that is considered cheap (default = 4)"));
using namespace PatternMatch;
+using namespace SCEVPatternMatch;
PoisonFlags::PoisonFlags(const Instruction *I) {
NUW = false;
@@ -1224,6 +1226,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
}
Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) {
+ Type *STy = S->getType();
const Loop *L = S->getLoop();
BasicBlock *EB = L->getExitBlock();
if (!EB || !EB->getSinglePredecessor() ||
@@ -1231,11 +1234,36 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) {
return nullptr;
for (auto &PN : EB->phis()) {
- if (!SE.isSCEVable(PN.getType()) || PN.getType() != S->getType())
+ if (!SE.isSCEVable(PN.getType()))
continue;
- auto *ExitV = SE.getSCEV(&PN);
- if (S == ExitV)
- return &PN;
+ auto *ExitSCEV = SE.getSCEV(&PN);
+ if (!isa<SCEVAddRecExpr>(ExitSCEV))
+ continue;
+ Type *PhiTy = PN.getType();
+ if (STy->isIntegerTy() && PhiTy->isPointerTy())
+ ExitSCEV = SE.getPtrToIntExpr(ExitSCEV, STy);
+ else if (S->getType() != PN.getType())
+ continue;
+
+ // Check if we can re-use the existing PN, by adjusting it with an expanded
+ // offset, if the offset is simpler.
+ const SCEV *Diff = SE.getMinusSCEV(S, ExitSCEV);
+ const SCEV *Op = Diff;
+ match(Diff, m_scev_Mul(m_scev_AllOnes(), m_SCEV(Op)));
+ match(Op, m_scev_PtrToInt(m_SCEV(Op)));
+ if (!isa<SCEVConstant, SCEVUnknown>(Op))
+ continue;
+
+ assert(Diff->getType()->isIntegerTy() &&
+ "difference must be of integer type");
+ Value *DiffV = expand(Diff);
+ Value *BaseV = &PN;
+ if (PhiTy->isPointerTy()) {
+ if (STy->isPointerTy())
+ return Builder.CreatePtrAdd(BaseV, DiffV);
+ BaseV = Builder.CreatePtrToInt(BaseV, DiffV->getType());
+ }
+ return Builder.CreateAdd(BaseV, DiffV);
}
return nullptr;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 99a96a8..a53ccdd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2021,6 +2021,9 @@ public:
/// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
/// outside VPlan.
std::pair<Value *, BasicBlock *> getMemRuntimeChecks() {
+ using namespace llvm::PatternMatch;
+ if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt()))
+ return {nullptr, nullptr};
return {MemRuntimeCheckCond, MemCheckBlock};
}
@@ -7276,6 +7279,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
+ VPlanTransforms::removeBranchOnConst(BestVPlan);
VPlanTransforms::narrowInterleaveGroups(
BestVPlan, BestVF,
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
@@ -10095,9 +10099,20 @@ bool LoopVectorizePass::processLoop(Loop *L) {
unsigned SelectedIC = std::max(IC, UserIC);
// Optimistically generate runtime checks if they are needed. Drop them if
// they turn out to not be profitable.
- if (VF.Width.isVector() || SelectedIC > 1)
+ if (VF.Width.isVector() || SelectedIC > 1) {
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+ // Bail out early if either the SCEV or memory runtime checks are known to
+ // fail. In that case, the vector loop would never execute.
+ using namespace llvm::PatternMatch;
+ if (Checks.getSCEVChecks().first &&
+ match(Checks.getSCEVChecks().first, m_One()))
+ return false;
+ if (Checks.getMemRuntimeChecks().first &&
+ match(Checks.getMemRuntimeChecks().first, m_One()))
+ return false;
+ }
+
// Check if it is profitable to vectorize with runtime checks.
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
@@ -10228,6 +10243,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan);
+ // TODO: Move to general VPlan pipeline once epilogue loops are also
+ // supported.
+ VPlanTransforms::runPass(VPlanTransforms::materializeVectorTripCount,
+ BestPlan, VF.Width, IC, PSE);
+
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
ORE->emit([&]() {
@@ -10295,6 +10315,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
Checks, BestPlan);
+ // TODO: Move to general VPlan pipeline once epilogue loops are also
+ // supported.
+ VPlanTransforms::runPass(VPlanTransforms::materializeVectorTripCount,
+ BestPlan, VF.Width, IC, PSE);
+
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0d0b342..593868f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -206,6 +206,12 @@ static cl::opt<bool> VectorizeNonPowerOf2(
"slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
cl::desc("Try to vectorize with non-power-of-2 number of elements."));
+/// Enables vectorization of copyable elements.
+static cl::opt<bool> VectorizeCopyableElements(
+ "slp-copyable-elements", cl::init(true), cl::Hidden,
+ cl::desc("Try to replace values with the idempotent instructions for "
+ "better vectorization."));
+
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
@@ -855,6 +861,13 @@ static std::optional<unsigned> getExtractIndex(const Instruction *E) {
return *EI->idx_begin();
}
+namespace llvm {
+/// Checks if the specified value does not require scheduling. It does not
+/// require scheduling if all operands and all users do not need to be scheduled
+/// in the current basic block.
+static bool doesNotNeedToBeScheduled(Value *V);
+} // namespace llvm
+
namespace {
/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
@@ -957,6 +970,33 @@ class BinOpSameOpcodeHelper {
return Instruction::Xor;
llvm_unreachable("Cannot find interchangeable instruction.");
}
+
+ /// Return true if the instruction can be converted to \p Opcode.
+ bool hasCandidateOpcode(unsigned Opcode) const {
+ MaskType Candidate = Mask & SeenBefore;
+ switch (Opcode) {
+ case Instruction::Shl:
+ return Candidate & ShlBIT;
+ case Instruction::AShr:
+ return Candidate & AShrBIT;
+ case Instruction::Mul:
+ return Candidate & MulBIT;
+ case Instruction::Add:
+ return Candidate & AddBIT;
+ case Instruction::Sub:
+ return Candidate & SubBIT;
+ case Instruction::And:
+ return Candidate & AndBIT;
+ case Instruction::Or:
+ return Candidate & OrBIT;
+ case Instruction::Xor:
+ return Candidate & XorBIT;
+ default:
+ break;
+ }
+ llvm_unreachable("Cannot find interchangeable instruction.");
+ }
+
SmallVector<Value *> getOperand(const Instruction *To) const {
unsigned ToOpcode = To->getOpcode();
unsigned FromOpcode = I->getOpcode();
@@ -1117,6 +1157,10 @@ public:
AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
}
unsigned getMainOpcode() const { return MainOp.getOpcode(); }
+ /// Checks if the list of potential opcodes includes \p Opcode.
+ bool hasCandidateOpcode(unsigned Opcode) const {
+ return MainOp.hasCandidateOpcode(Opcode);
+ }
bool hasAltOp() const { return AltOp.I; }
unsigned getAltOpcode() const {
return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
@@ -1152,6 +1196,8 @@ class InstructionsState {
/// GetVectorCost.
Instruction *MainOp = nullptr;
Instruction *AltOp = nullptr;
+ /// Wether the instruction state represents copyable instructions.
+ bool HasCopyables = false;
public:
Instruction *getMainOp() const {
@@ -1190,9 +1236,11 @@ public:
if (!I->isBinaryOp())
return nullptr;
BinOpSameOpcodeHelper Converter(MainOp);
- if (Converter.add(I) && Converter.add(MainOp) && !Converter.hasAltOp())
- return MainOp;
- return AltOp;
+ if (!Converter.add(I) || !Converter.add(MainOp))
+ return nullptr;
+ if (Converter.hasAltOp() && !isAltShuffle())
+ return nullptr;
+ return Converter.hasAltOp() ? AltOp : MainOp;
}
/// Checks if main/alt instructions are shift operations.
@@ -1237,9 +1285,63 @@ public:
explicit operator bool() const { return valid(); }
InstructionsState() = delete;
- InstructionsState(Instruction *MainOp, Instruction *AltOp)
- : MainOp(MainOp), AltOp(AltOp) {}
+ InstructionsState(Instruction *MainOp, Instruction *AltOp,
+ bool HasCopyables = false)
+ : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
static InstructionsState invalid() { return {nullptr, nullptr}; }
+
+ bool isCopyableElement(Value *V) const {
+ assert(valid() && "InstructionsState is invalid.");
+ if (!HasCopyables)
+ return false;
+ if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
+ return false;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return !isa<PoisonValue>(V);
+ if (I->getParent() != MainOp->getParent() &&
+ (!isVectorLikeInstWithConstOps(I) ||
+ !isVectorLikeInstWithConstOps(MainOp)))
+ return true;
+ if (I->getOpcode() == MainOp->getOpcode())
+ return false;
+ if (!I->isBinaryOp())
+ return true;
+ BinOpSameOpcodeHelper Converter(MainOp);
+ return !Converter.add(I) || !Converter.add(MainOp) ||
+ Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
+ }
+
+ /// Checks if the value is non-schedulable.
+ bool isNonSchedulable(Value *V) const {
+ assert(valid() && "InstructionsState is invalid.");
+ auto *I = dyn_cast<Instruction>(V);
+ if (!HasCopyables)
+ return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
+ doesNotNeedToBeScheduled(V);
+ // MainOp for copyables always schedulable to correctly identify
+ // non-schedulable copyables.
+ if (isCopyableElement(V)) {
+ auto IsNonSchedulableCopyableElement = [this](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
+ (doesNotNeedToBeScheduled(I) &&
+ // If the copyable instructions comes after MainOp
+ // (non-schedulable, but used in the block) - cannot vectorize
+ // it, will possibly generate use before def.
+ (isVectorLikeInstWithConstOps(I) || !MainOp->comesBefore(I)));
+ };
+
+ return IsNonSchedulableCopyableElement(V);
+ }
+ return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
+ doesNotNeedToBeScheduled(V);
+ }
+
+ bool areInstructionsWithCopyableElements() const {
+ assert(valid() && "InstructionsState is invalid.");
+ return HasCopyables;
+ }
};
std::pair<Instruction *, SmallVector<Value *>>
@@ -1917,6 +2019,7 @@ public:
CompressEntryToData.clear();
ExternalUses.clear();
ExternalUsesAsOriginalScalar.clear();
+ ExternalUsesWithNonUsers.clear();
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
@@ -2899,9 +3002,6 @@ public:
for (OperandDataVec &Ops : OpsVec)
Ops.resize(NumLanes);
for (unsigned Lane : seq<unsigned>(NumLanes)) {
- Value *V = VL[Lane];
- assert((isa<Instruction>(V) || isa<PoisonValue>(V)) &&
- "Expected instruction or poison value");
// Our tree has just 3 nodes: the root and two operands.
// It is therefore trivial to get the APO. We only need to check the
// opcode of V and whether the operand at OpIdx is the LHS or RHS
@@ -2912,17 +3012,24 @@ public:
// Since operand reordering is performed on groups of commutative
// operations or alternating sequences (e.g., +, -), we can safely tell
// the inverse operations by checking commutativity.
- if (isa<PoisonValue>(V)) {
+ auto *I = dyn_cast<Instruction>(VL[Lane]);
+ if (!I && isa<PoisonValue>(VL[Lane])) {
for (unsigned OpIdx : seq<unsigned>(NumOperands))
OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
continue;
}
- auto [SelectedOp, Ops] = convertTo(cast<Instruction>(V), S);
- // We cannot check commutativity by the converted instruction
- // (SelectedOp) because isCommutative also examines def-use
- // relationships.
- bool IsInverseOperation =
- !isCommutative(SelectedOp, cast<Instruction>(V));
+ bool IsInverseOperation = false;
+ if (S.isCopyableElement(VL[Lane])) {
+ // The value is a copyable element.
+ IsInverseOperation = !isCommutative(MainOp);
+ } else {
+ assert(I && "Expected instruction");
+ auto [SelectedOp, Ops] = convertTo(I, S);
+ // We cannot check commutativity by the converted instruction
+ // (SelectedOp) because isCommutative also examines def-use
+ // relationships.
+ IsInverseOperation = !isCommutative(SelectedOp, I);
+ }
for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
bool APO = (OpIdx == 0) ? false : IsInverseOperation;
OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
@@ -3792,6 +3899,9 @@ private:
/// reordering of operands during buildTreeRec() and vectorizeTree().
SmallVector<ValueList, 2> Operands;
+ /// Copyable elements of the entry node.
+ SmallPtrSet<const Value *, 4> CopyableElements;
+
/// MainOp and AltOp are recorded inside. S should be obtained from
/// newTreeEntry.
InstructionsState S = InstructionsState::invalid();
@@ -3820,11 +3930,7 @@ private:
void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
/// Marks the node as one that does not require scheduling.
- void setDoesNotNeedToSchedule() {
- assert(::doesNotNeedToSchedule(Scalars) &&
- "Expected to not need scheduling");
- DoesNotNeedToSchedule = true;
- }
+ void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
/// Returns true if the node is marked as one that does not require
/// scheduling.
bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
@@ -3896,6 +4002,20 @@ private:
bool hasState() const { return S.valid(); }
+ /// Add \p V to the list of copyable elements.
+ void addCopyableElement(Value *V) {
+ assert(S.isCopyableElement(V) && "Not a copyable element.");
+ CopyableElements.insert(V);
+ }
+
+ /// Returns true if \p V is a copyable element.
+ bool isCopyableElement(Value *V) const {
+ return CopyableElements.contains(V);
+ }
+
+ /// Returns true if any scalar in the list is a copyable element.
+ bool hasCopyableElements() const { return !CopyableElements.empty(); }
+
/// When ReuseReorderShuffleIndices is empty it just returns position of \p
/// V within vector of Scalars. Otherwise, try to remap on its reuse index.
unsigned findLaneForValue(Value *V) const {
@@ -3968,6 +4088,8 @@ private:
for (Value *V : Scalars)
dbgs().indent(2) << *V << "\n";
dbgs() << "State: ";
+ if (S && hasCopyableElements())
+ dbgs() << "[[Copyable]] ";
switch (State) {
case Vectorize:
if (InterleaveFactor > 0) {
@@ -4145,12 +4267,20 @@ private:
}
}
} else if (!Last->isGather()) {
- if (doesNotNeedToSchedule(VL))
+ if (isa<PHINode>(S.getMainOp()) ||
+ isVectorLikeInstWithConstOps(S.getMainOp()) ||
+ (!S.areInstructionsWithCopyableElements() &&
+ doesNotNeedToSchedule(VL)) ||
+ all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
Last->setDoesNotNeedToSchedule();
SmallPtrSet<Value *, 4> Processed;
for (Value *V : VL) {
if (isa<PoisonValue>(V))
continue;
+ if (S.isCopyableElement(V)) {
+ Last->addCopyableElement(V);
+ continue;
+ }
auto It = ScalarToTreeEntries.find(V);
if (It == ScalarToTreeEntries.end()) {
ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
@@ -4162,16 +4292,14 @@ private:
}
}
// Update the scheduler bundle to point to this TreeEntry.
- assert((!Bundle.getBundle().empty() || isa<PHINode>(S.getMainOp()) ||
- isVectorLikeInstWithConstOps(S.getMainOp()) ||
- Last->doesNotNeedToSchedule()) &&
+ assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
"Bundle and VL out of sync");
if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
auto *BundleMember = Bundle.getBundle().begin();
SmallPtrSet<Value *, 4> Processed;
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V) || !Processed.insert(V).second)
+ if (S.isNonSchedulable(V) || !Processed.insert(V).second)
continue;
++BundleMember;
}
@@ -4280,7 +4408,8 @@ private:
/// in general.
ScalarsVectorizationLegality
getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
- const EdgeInfo &UserTreeIdx) const;
+ const EdgeInfo &UserTreeIdx,
+ bool TryCopyableElementsVectorization) const;
/// Checks if the specified list of the instructions/values can be vectorized
/// and fills required data before actual scheduling of the instructions.
@@ -4420,6 +4549,10 @@ private:
/// extractelement instructions.
SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
+ /// A list of scalar to be extracted without specific user necause of too many
+ /// uses.
+ SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
+
/// Values used only by @llvm.assume calls.
SmallPtrSet<const Value *, 32> EphValues;
@@ -4996,7 +5129,8 @@ private:
/// Build a bundle from the ScheduleData nodes corresponding to the
/// scalar instruction for each lane.
- ScheduleBundle &buildBundle(ArrayRef<Value *> VL);
+ ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S);
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
@@ -6727,7 +6861,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
return std::move(ResOrder);
}
if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
- (!TE.UserTreeIndex ||
+ (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
!Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
(TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
return std::nullopt;
@@ -7038,10 +7172,11 @@ bool BoUpSLP::isProfitableToReorder() const {
VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
VectorizableTree.front()->ReorderIndices.empty()) {
// Check if the tree has only single store and single (unordered) load node,
- // other nodes are phis or geps/binops, combined with phis, and/orsingle
+ // other nodes are phis or geps/binops, combined with phis, and/or single
// gather load node
bool HasPhis = false;
- if (VectorizableTree.front()->getOpcode() == Instruction::PHI &&
+ if (VectorizableTree.front()->hasState() &&
+ VectorizableTree.front()->getOpcode() == Instruction::PHI &&
VectorizableTree.front()->Scalars.size() == TinyVF &&
VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
return false;
@@ -7049,6 +7184,8 @@ bool BoUpSLP::isProfitableToReorder() const {
unsigned GatherLoads = 0;
for (const std::unique_ptr<TreeEntry> &TE :
ArrayRef(VectorizableTree).drop_front()) {
+ if (TE->State == TreeEntry::SplitVectorize)
+ continue;
if (!TE->hasState()) {
if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>))
@@ -7072,7 +7209,10 @@ bool BoUpSLP::isProfitableToReorder() const {
if (TE->getOpcode() == Instruction::GetElementPtr ||
Instruction::isBinaryOp(TE->getOpcode()))
continue;
- if (TE->getOpcode() != Instruction::PHI)
+ if (TE->getOpcode() != Instruction::PHI &&
+ (!TE->hasCopyableElements() ||
+ static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
+ TE->Scalars.size() / 2))
return true;
if (VectorizableTree.front()->Scalars.size() == TinyVF &&
TE->getNumOperands() > PhiOpsLimit)
@@ -7860,7 +8000,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
- if ((Entry.getOpcode() == Instruction::Store ||
+ if (Entry.hasState() &&
+ (Entry.getOpcode() == Instruction::Store ||
Entry.getOpcode() == Instruction::Load) &&
Entry.State == TreeEntry::StridedVectorize &&
!Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
@@ -7870,7 +8011,9 @@ Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
void BoUpSLP::buildExternalUses(
const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
+ const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
DenseMap<Value *, unsigned> ScalarToExtUses;
+ SmallPtrSet<Value *, 4> ExternalUsers;
// Collect the values that we need to extract from the tree.
for (auto &TEPtr : VectorizableTree) {
TreeEntry *Entry = TEPtr.get();
@@ -7882,13 +8025,24 @@ void BoUpSLP::buildExternalUses(
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
- if (!isa<Instruction>(Scalar))
+ if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
continue;
+
// All uses must be replaced already? No need to do it again.
auto It = ScalarToExtUses.find(Scalar);
if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
continue;
+ if (Scalar->hasNUsesOrMore(NumVectScalars)) {
+ unsigned FoundLane = Entry->findLaneForValue(Scalar);
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
+ << " from " << *Scalar << "for many users.\n");
+ It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
+ ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
+ ExternalUsesWithNonUsers.insert(Scalar);
+ continue;
+ }
+
// Check if the scalar is externally used as an extra arg.
const auto ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
@@ -7916,7 +8070,10 @@ void BoUpSLP::buildExternalUses(
// Some in-tree scalars will remain as scalar in vectorized
// instructions. If that is the case, the one in FoundLane will
// be used.
- if (all_of(UseEntries, [&](TreeEntry *UseEntry) {
+ if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
+ isa<LoadInst, StoreInst>(UserInst)) ||
+ isa<CallInst>(UserInst)) ||
+ all_of(UseEntries, [&](TreeEntry *UseEntry) {
return UseEntry->State == TreeEntry::ScatterVectorize ||
!doesInTreeUserNeedToExtract(
Scalar, getRootEntryInstruction(*UseEntry), TLI,
@@ -7946,6 +8103,7 @@ void BoUpSLP::buildExternalUses(
<< ".\n");
It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
+ ExternalUsesWithNonUsers.insert(Scalar);
if (!U)
break;
}
@@ -9612,7 +9770,8 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
PoisonValue::get(UniqueValues.front()->getType()));
// Check that extended with poisons operations are still valid for
// vectorization (div/rem are not allowed).
- if (!getSameOpcode(PaddedUniqueValues, TLI).valid()) {
+ if (!S.areInstructionsWithCopyableElements() &&
+ !getSameOpcode(PaddedUniqueValues, TLI).valid()) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
ReuseShuffleIndices.clear();
return false;
@@ -9761,13 +9920,95 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
}
namespace {
-/// Class accepts incoming list of values and generates the list of values
-/// for scheduling and list of operands for the new nodes.
+/// Class accepts incoming list of values, checks if it is able to model
+/// "copyable" values as compatible operations, and generates the list of values
+/// for scheduling and list of operands doe the new nodes.
class InstructionsCompatibilityAnalysis {
DominatorTree &DT;
const DataLayout &DL;
const TargetTransformInfo &TTI;
const TargetLibraryInfo &TLI;
+ unsigned MainOpcode = 0;
+ Instruction *MainOp = nullptr;
+
+ /// Identifies the best candidate value, which represents main opcode
+ /// operation.
+ /// Currently the best candidate is the Add instruction with the parent
+ /// block with the highest DFS incoming number (block, that dominates other).
+ void findAndSetMainInstruction(ArrayRef<Value *> VL) {
+ BasicBlock *Parent = nullptr;
+ // Checks if the instruction has supported opcode.
+ auto IsSupportedOpcode = [](Instruction *I) {
+ return I && I->getOpcode() == Instruction::Add;
+ };
+ SmallDenseSet<Value *, 8> Operands;
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (!DT.isReachableFromEntry(I->getParent()))
+ continue;
+ if (!MainOp) {
+ MainOp = I;
+ Parent = I->getParent();
+ Operands.insert(I->op_begin(), I->op_end());
+ continue;
+ }
+ if (Parent == I->getParent()) {
+ if (!IsSupportedOpcode(MainOp))
+ MainOp = I;
+ if (MainOp->getOpcode() == I->getOpcode() &&
+ doesNotNeedToBeScheduled(MainOp) && !doesNotNeedToBeScheduled(I))
+ MainOp = I;
+ Operands.insert(I->op_begin(), I->op_end());
+ continue;
+ }
+ auto *NodeA = DT.getNode(Parent);
+ auto *NodeB = DT.getNode(I->getParent());
+ assert(NodeA && "Should only process reachable instructions");
+ assert(NodeB && "Should only process reachable instructions");
+ assert((NodeA == NodeB) ==
+ (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
+ if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
+ MainOp = I;
+ Parent = I->getParent();
+ Operands.clear();
+ Operands.insert(I->op_begin(), I->op_end());
+ }
+ }
+ if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) {
+ MainOp = nullptr;
+ return;
+ }
+ MainOpcode = MainOp->getOpcode();
+ }
+
+ /// Returns the idempotent value for the \p MainOp with the detected \p
+ /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
+ /// the operand itself, since V or V == V.
+ Value *selectBestIdempotentValue() const {
+ assert(MainOpcode == Instruction::Add && "Unsupported opcode");
+ return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
+ !MainOp->isCommutative());
+ }
+
+ /// Returns the value and operands for the \p V, considering if it is original
+ /// instruction and its actual operands should be returned, or it is a
+ /// copyable element and its should be represented as idempotent instruction.
+ SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
+ if (isa<PoisonValue>(V))
+ return {V, V};
+ if (!S.isCopyableElement(V))
+ return convertTo(cast<Instruction>(V), S).second;
+ switch (MainOpcode) {
+ case Instruction::Add:
+ return {V, selectBestIdempotentValue()};
+ default:
+ break;
+ }
+ llvm_unreachable("Unsupported opcode");
+ }
/// Builds operands for the original instructions.
void
@@ -9928,22 +10169,165 @@ public:
const TargetLibraryInfo &TLI)
: DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
+ InstructionsState
+ buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
+ bool TryCopyableElementsVectorization,
+ bool WithProfitabilityCheck = false,
+ bool SkipSameCodeCheck = false) {
+ InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
+ ? InstructionsState::invalid()
+ : getSameOpcode(VL, TLI);
+ if (S)
+ return S;
+ if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
+ return S;
+ findAndSetMainInstruction(VL);
+ if (!MainOp)
+ return InstructionsState::invalid();
+ S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
+ // TODO: Remove this check once support for schulable copyables is landed.
+ if (any_of(VL, [&](Value *V) {
+ return S.isCopyableElement(V) && !S.isNonSchedulable(V);
+ }))
+ return InstructionsState::invalid();
+
+ if (!WithProfitabilityCheck)
+ return S;
+ // Check if it is profitable to vectorize the instruction.
+ SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
+ auto BuildCandidates =
+ [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
+ Value *V2) {
+ if (V1 != V2 && isa<PHINode>(V1))
+ return;
+ auto *I1 = dyn_cast<Instruction>(V1);
+ auto *I2 = dyn_cast<Instruction>(V2);
+ if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
+ I1->getParent() != I2->getParent())
+ return;
+ Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
+ };
+ if (VL.size() == 2) {
+ // Check if the operands allow better vectorization.
+ SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
+ BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
+ BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
+ bool Res = !Candidates1.empty() && !Candidates2.empty() &&
+ R.findBestRootPair(Candidates1) &&
+ R.findBestRootPair(Candidates2);
+ if (!Res && isCommutative(MainOp)) {
+ Candidates1.clear();
+ Candidates2.clear();
+ BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
+ BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
+ Res = !Candidates1.empty() && !Candidates2.empty() &&
+ R.findBestRootPair(Candidates1) &&
+ R.findBestRootPair(Candidates2);
+ }
+ if (!Res)
+ return InstructionsState::invalid();
+ return S;
+ }
+ assert(Operands.size() == 2 && "Unexpected number of operands!");
+ unsigned CopyableNum =
+ count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
+ if (CopyableNum < VL.size() / 2)
+ return S;
+ // Too many phi copyables - exit.
+ const unsigned Limit = VL.size() / 24;
+ if ((CopyableNum >= VL.size() - Limit ||
+ (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
+ CopyableNum >= MaxPHINumOperands) &&
+ all_of(VL, [&](Value *V) {
+ return isa<PHINode>(V) || !S.isCopyableElement(V);
+ }))
+ return InstructionsState::invalid();
+ // Check profitability if number of copyables > VL.size() / 2.
+ // 1. Reorder operands for better matching.
+ if (isCommutative(MainOp)) {
+ for (auto &Ops : Operands) {
+ // Make instructions the first operands.
+ if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
+ std::swap(Ops.front(), Ops.back());
+ continue;
+ }
+ // Make constants the second operands.
+ if (isa<Constant>(Ops.front())) {
+ std::swap(Ops.front(), Ops.back());
+ continue;
+ }
+ }
+ }
+ // 2. Check, if operands can be vectorized.
+ if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
+ return InstructionsState::invalid();
+ auto CheckOperand = [&](ArrayRef<Value *> Ops) {
+ if (allConstant(Ops) || isSplat(Ops))
+ return true;
+ // Check if it is "almost" splat, i.e. has >= 4 elements and only single
+ // one is different.
+ constexpr unsigned Limit = 4;
+ if (Operands.front().size() >= Limit) {
+ SmallDenseMap<const Value *, unsigned> Counters;
+ for (Value *V : Ops) {
+ if (isa<UndefValue>(V))
+ continue;
+ ++Counters[V];
+ }
+ if (Counters.size() == 2 &&
+ any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
+ return C.second == 1;
+ }))
+ return true;
+ }
+ // First operand not a constant or splat? Last attempt - check for
+ // potential vectorization.
+ InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
+ InstructionsState OpS = Analysis.buildInstructionsState(
+ Ops, R, /*TryCopyableElementsVectorization=*/true);
+ if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
+ return false;
+ unsigned CopyableNum =
+ count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
+ return CopyableNum <= VL.size() / 2;
+ };
+ if (!CheckOperand(Operands.front()))
+ return InstructionsState::invalid();
+
+ return S;
+ }
+
SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
ArrayRef<Value *> VL) {
assert(S && "Invalid state!");
SmallVector<BoUpSLP::ValueList> Operands;
- buildOriginalOperands(S, VL, Operands);
+ if (S.areInstructionsWithCopyableElements()) {
+ MainOp = S.getMainOp();
+ MainOpcode = S.getOpcode();
+ Operands.assign(MainOp->getNumOperands(),
+ BoUpSLP::ValueList(VL.size(), nullptr));
+ for (auto [Idx, V] : enumerate(VL)) {
+ SmallVector<Value *> OperandsForValue = getOperands(S, V);
+ for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
+ Operands[OperandIdx][Idx] = Operand;
+ }
+ } else {
+ buildOriginalOperands(S, VL, Operands);
+ }
return Operands;
}
};
} // namespace
-BoUpSLP::ScalarsVectorizationLegality
-BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
- const EdgeInfo &UserTreeIdx) const {
+BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
+ ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
+ bool TryCopyableElementsVectorization) const {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
- InstructionsState S = getSameOpcode(VL, *TLI);
+ InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
+ InstructionsState S = Analysis.buildInstructionsState(
+ VL, *this, TryCopyableElementsVectorization,
+ /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
// Don't go into catchswitch blocks, which can happen with PHIs.
// Such blocks can only have PHIs and the catchswitch. There is no
@@ -10066,7 +10450,7 @@ BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
bool IsScatterVectorizeUserTE =
UserTreeIdx.UserTE &&
UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
- bool AreAllSameBlock = S && allSameBlock(VL);
+ bool AreAllSameBlock = S.valid();
bool AreScatterAllGEPSameBlock =
(IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
VL.size() > 2 &&
@@ -10091,12 +10475,18 @@ BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
NotProfitableForVectorization(VL)) {
if (!S) {
LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
- "C,S,B,O, small shuffle. \n");
+ "C,S,B,O, small shuffle. \n";
+ dbgs() << "[";
+ interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
+ dbgs() << "]\n");
return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
/*TryToFindDuplicates=*/true,
/*TrySplitVectorize=*/true);
}
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
+ dbgs() << "[";
+ interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
+ dbgs() << "]\n");
return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
}
@@ -10242,9 +10632,29 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
return true;
};
- ScalarsVectorizationLegality Legality =
- getScalarsVectorizationLegality(VL, Depth, UserTreeIdx);
- const InstructionsState &S = Legality.getInstructionsState();
+ auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
+ bool AreConsts = false;
+ for (Value *V : VL) {
+ if (isa<PoisonValue>(V))
+ continue;
+ if (isa<Constant>(V)) {
+ AreConsts = true;
+ continue;
+ }
+ if (!isa<PHINode>(V))
+ return false;
+ }
+ return AreConsts;
+ };
+ if (AreOnlyConstsWithPHIs(VL)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
+ newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
+ return;
+ }
+
+ ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
+ VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
+ InstructionsState S = Legality.getInstructionsState();
if (!Legality.isLegal()) {
if (Legality.trySplitVectorize()) {
auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
@@ -10252,11 +10662,18 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
return;
}
- if (Legality.tryToFindDuplicates())
- tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx);
+ if (!S)
+ Legality = getScalarsVectorizationLegality(
+ VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
+ if (!Legality.isLegal()) {
+ if (Legality.tryToFindDuplicates())
+ tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
+ UserTreeIdx);
- newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
- return;
+ newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
+ return;
+ }
+ S = Legality.getInstructionsState();
}
// FIXME: investigate if there are profitable cases for VL.size() <= 4.
@@ -13024,7 +13441,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
assert(E->getOpcode() &&
((allSameType(VL) && allSameBlock(VL)) ||
(E->getOpcode() == Instruction::GetElementPtr &&
- E->getMainOp()->getType()->isPointerTy())) &&
+ E->getMainOp()->getType()->isPointerTy()) ||
+ E->hasCopyableElements()) &&
"Invalid VL");
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
@@ -13036,6 +13454,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallBitVector UsedScalars(Sz, false);
for (unsigned I = 0; I < Sz; ++I) {
if (isa<Instruction>(UniqueValues[I]) &&
+ !E->isCopyableElement(UniqueValues[I]) &&
getTreeEntries(UniqueValues[I]).front() == E)
continue;
UsedScalars.set(I);
@@ -14075,15 +14494,45 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
// If the tree contains only phis, buildvectors, split nodes and
// small nodes with reuses, we can skip it.
+ SmallVector<const TreeEntry *> StoreLoadNodes;
+ unsigned NumGathers = 0;
+ constexpr int LimitTreeSize = 36;
if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
- all_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
- return TE->State == TreeEntry::SplitVectorize ||
- (TE->isGather() &&
- none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
- (TE->hasState() && (TE->getOpcode() == Instruction::PHI ||
- (!TE->ReuseShuffleIndices.empty() &&
- TE->Scalars.size() == 2)));
- }))
+ all_of(VectorizableTree,
+ [&](const std::unique_ptr<TreeEntry> &TE) {
+ if (!TE->isGather() && TE->hasState() &&
+ (TE->getOpcode() == Instruction::Load ||
+ TE->getOpcode() == Instruction::Store)) {
+ StoreLoadNodes.push_back(TE.get());
+ return true;
+ }
+ if (TE->isGather())
+ ++NumGathers;
+ return TE->State == TreeEntry::SplitVectorize ||
+ (TE->Idx == 0 && TE->Scalars.size() == 2 &&
+ TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
+ VectorizableTree.size() > LimitTreeSize) ||
+ (TE->isGather() &&
+ none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
+ (TE->hasState() &&
+ (TE->getOpcode() == Instruction::PHI ||
+ (TE->hasCopyableElements() &&
+ static_cast<unsigned>(count_if(
+ TE->Scalars, IsaPred<PHINode, Constant>)) >=
+ TE->Scalars.size() / 2) ||
+ ((!TE->ReuseShuffleIndices.empty() ||
+ !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
+ TE->Scalars.size() == 2)));
+ }) &&
+ (StoreLoadNodes.empty() ||
+ (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
+ (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
+ return TE->getOpcode() == Instruction::Store ||
+ all_of(TE->Scalars, [&](Value *V) {
+ return !isa<LoadInst>(V) ||
+ areAllUsersVectorized(cast<Instruction>(V));
+ });
+ })))))
return true;
// We can vectorize the tree if its size is greater than or equal to the
@@ -14826,6 +15275,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
bool IsProfitablePHIUser =
(KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
VectorizableTree.front()->Scalars.size() > 2)) &&
+ VectorizableTree.front()->hasState() &&
VectorizableTree.front()->getOpcode() == Instruction::PHI &&
!Inst->hasNUsesOrMore(UsesLimit) &&
none_of(Inst->users(),
@@ -15276,7 +15726,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
const BasicBlock *TEInsertBlock = nullptr;
// Main node of PHI entries keeps the correct order of operands/incoming
// blocks.
- if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp());
+ if (auto *PHI = dyn_cast_or_null<PHINode>(
+ TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
TEInsertPt = TEInsertBlock->getTerminator();
@@ -15375,7 +15826,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
"Expected only single user of a gather node.");
const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
- PHINode *UserPHI = UseEI.UserTE->State != TreeEntry::SplitVectorize
+ PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
+ UseEI.UserTE->hasState())
? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
: nullptr;
Instruction *InsertPt =
@@ -15388,7 +15840,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
TEUseEI.UserTE->isAltShuffle()) &&
all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
if (UseEI.UserTE->State != TreeEntry::Vectorize ||
- (UseEI.UserTE->getOpcode() == Instruction::PHI &&
+ (UseEI.UserTE->hasState() &&
+ UseEI.UserTE->getOpcode() == Instruction::PHI &&
!UseEI.UserTE->isAltShuffle()) ||
!all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
continue;
@@ -16009,25 +16462,32 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
Instruction *Res = nullptr;
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block (except for extractelement-like instructions with
- // constant indices or gathered loads).
- auto *Front = E->getMainOp();
+ // constant indices or gathered loads or copyables).
+ Instruction *Front;
+ unsigned Opcode;
+ if (E->hasState()) {
+ Front = E->getMainOp();
+ Opcode = E->getOpcode();
+ } else {
+ Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
+ Opcode = Front->getOpcode();
+ }
auto *BB = Front->getParent();
- assert(((GatheredLoadsEntriesFirst.has_value() &&
- E->getOpcode() == Instruction::Load && E->isGather() &&
- E->Idx < *GatheredLoadsEntriesFirst) ||
- E->State == TreeEntry::SplitVectorize ||
- all_of(E->Scalars,
- [=](Value *V) -> bool {
- if (E->getOpcode() == Instruction::GetElementPtr &&
- !isa<GetElementPtrInst>(V))
- return true;
- auto *I = dyn_cast<Instruction>(V);
- return !I || !E->getMatchingMainOpOrAltOp(I) ||
- I->getParent() == BB ||
- isVectorLikeInstWithConstOps(I);
- })) &&
- "Expected gathered loads or GEPs or instructions from same basic "
- "block.");
+ assert(
+ ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
+ E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
+ E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
+ all_of(E->Scalars,
+ [=](Value *V) -> bool {
+ if (Opcode == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(V))
+ return true;
+ auto *I = dyn_cast<Instruction>(V);
+ return !I || !E->getMatchingMainOpOrAltOp(I) ||
+ I->getParent() == BB || isVectorLikeInstWithConstOps(I);
+ })) &&
+ "Expected gathered loads or GEPs or instructions from same basic "
+ "block.");
auto FindLastInst = [&]() {
Instruction *LastInst = Front;
@@ -16035,18 +16495,20 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
continue;
+ if (E->isCopyableElement(I))
+ continue;
if (LastInst->getParent() == I->getParent()) {
if (LastInst->comesBefore(I))
LastInst = I;
continue;
}
- assert(((E->getOpcode() == Instruction::GetElementPtr &&
+ assert(((Opcode == Instruction::GetElementPtr &&
!isa<GetElementPtrInst>(I)) ||
E->State == TreeEntry::SplitVectorize ||
(isVectorLikeInstWithConstOps(LastInst) &&
isVectorLikeInstWithConstOps(I)) ||
(GatheredLoadsEntriesFirst.has_value() &&
- E->getOpcode() == Instruction::Load && E->isGather() &&
+ Opcode == Instruction::Load && E->isGather() &&
E->Idx < *GatheredLoadsEntriesFirst)) &&
"Expected vector-like or non-GEP in GEP node insts only.");
if (!DT->isReachableFromEntry(LastInst->getParent())) {
@@ -16075,16 +16537,18 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
continue;
+ if (E->isCopyableElement(I))
+ continue;
if (FirstInst->getParent() == I->getParent()) {
if (I->comesBefore(FirstInst))
FirstInst = I;
continue;
}
- assert(((E->getOpcode() == Instruction::GetElementPtr &&
- !isa<GetElementPtrInst>(I)) ||
- (isVectorLikeInstWithConstOps(FirstInst) &&
- isVectorLikeInstWithConstOps(I))) &&
- "Expected vector-like or non-GEP in GEP node insts only.");
+ assert(((Opcode == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(I)) ||
+ (isVectorLikeInstWithConstOps(FirstInst) &&
+ isVectorLikeInstWithConstOps(I))) &&
+ "Expected vector-like or non-GEP in GEP node insts only.");
if (!DT->isReachableFromEntry(FirstInst->getParent())) {
FirstInst = I;
continue;
@@ -16122,7 +16586,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
// Set insertpoint for gathered loads to the very first load.
if (GatheredLoadsEntriesFirst.has_value() &&
E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
- E->getOpcode() == Instruction::Load) {
+ Opcode == Instruction::Load) {
Res = FindFirstInst();
EntryToLastInstruction.try_emplace(E, Res);
return *Res;
@@ -16139,7 +16603,8 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
return nullptr;
for (Value *V : E->Scalars) {
auto *I = dyn_cast<Instruction>(V);
- if (!I || isa<PHINode>(I) || doesNotNeedToBeScheduled(I))
+ if (!I || isa<PHINode>(I) ||
+ (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
continue;
ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
if (Bundles.empty())
@@ -16153,13 +16618,13 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
};
const ScheduleBundle *Bundle = FindScheduleBundle(E);
if (!E->isGather() && !Bundle) {
- if ((E->getOpcode() == Instruction::GetElementPtr &&
+ if ((Opcode == Instruction::GetElementPtr &&
any_of(E->Scalars,
[](Value *V) {
return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
})) ||
- all_of(E->Scalars, [](Value *V) {
- return isa<PoisonValue>(V) ||
+ all_of(E->Scalars, [&](Value *V) {
+ return isa<PoisonValue>(V) || E->isCopyableElement(V) ||
(!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
}))
Res = FindLastInst();
@@ -18640,6 +19105,7 @@ Value *BoUpSLP::vectorizeTree(
TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
(TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
TE->UserTreeIndex.UserTE->isAltShuffle()) &&
+ !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
all_of(TE->UserTreeIndex.UserTE->Scalars,
[](Value *V) { return isUsedOutsideBlock(V); })) {
Instruction &LastInst =
@@ -18903,7 +19369,7 @@ Value *BoUpSLP::vectorizeTree(
continue;
assert(
(ExternallyUsedValues.count(Scalar) ||
- Scalar->hasNUsesOrMore(UsesLimit) ||
+ ExternalUsesWithNonUsers.count(Scalar) ||
ExternalUsesAsOriginalScalar.contains(Scalar) ||
any_of(
Scalar->users(),
@@ -19182,7 +19648,7 @@ Value *BoUpSLP::vectorizeTree(
if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
EE && IgnoredExtracts.contains(EE))
continue;
- if (isa<PoisonValue>(Scalar))
+ if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
continue;
#ifndef NDEBUG
Type *Ty = Scalar->getType();
@@ -19424,12 +19890,15 @@ void BoUpSLP::optimizeGatherSequence() {
}
BoUpSLP::ScheduleBundle &
-BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
+BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S) {
auto &BundlePtr =
ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
for (Value *V : VL) {
if (doesNotNeedToBeScheduled(V))
continue;
+ if (S.isCopyableElement(V))
+ continue;
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember && "no ScheduleData for bundle member "
"(maybe not in same basic block)");
@@ -19450,10 +19919,19 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S) {
// No need to schedule PHIs, insertelement, extractelement and extractvalue
// instructions.
+ bool HasCopyables = S.areInstructionsWithCopyableElements();
if (isa<PHINode>(S.getMainOp()) ||
- isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
+ isVectorLikeInstWithConstOps(S.getMainOp()) ||
+ (!HasCopyables && doesNotNeedToSchedule(VL)) ||
+ all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
return nullptr;
+ // TODO Remove once full support for copyables is landed.
+ assert(all_of(VL,
+ [&](Value *V) {
+ return !S.isCopyableElement(V) || S.isNonSchedulable(V);
+ }) &&
+ "Copyable elements should not be schedulable");
// Initialize the instruction bundle.
Instruction *OldScheduleEnd = ScheduleEnd;
LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
@@ -19499,7 +19977,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V))
+ if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V))
continue;
if (!extendSchedulingRegion(V, S)) {
// If the scheduling region got new instructions at the lower end (or it
@@ -19516,7 +19994,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
bool ReSchedule = false;
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V))
+ if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V))
continue;
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember &&
@@ -19541,7 +20019,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
ReSchedule = true;
}
- ScheduleBundle &Bundle = buildBundle(VL);
+ ScheduleBundle &Bundle = buildBundle(VL, S);
TryScheduleBundleImpl(ReSchedule, Bundle);
if (!Bundle.isReady()) {
for (ScheduleData *BD : Bundle.getBundle()) {
@@ -19558,7 +20036,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
}
ScheduledBundlesList.pop_back();
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V))
+ if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V))
continue;
ScheduledBundles.find(cast<Instruction>(V))->getSecond().pop_back();
}
@@ -20187,7 +20665,7 @@ bool BoUpSLP::collectValuesToDemote(
};
if (E.isGather() || !Visited.insert(&E).second ||
any_of(E.Scalars, [&](Value *V) {
- return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
+ return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
return isa<InsertElementInst>(U) && !isVectorized(U);
});
}))
@@ -20555,9 +21033,10 @@ void BoUpSLP::computeMinimumValueSizes() {
if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
SelectInst>(U) ||
isa<SIToFPInst, UIToFPInst>(U) ||
- !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
- SelectInst>(UserTE->getMainOp()) ||
- isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))
+ (UserTE->hasState() &&
+ (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
+ SelectInst>(UserTE->getMainOp()) ||
+ isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
return true;
unsigned UserTESz = DL->getTypeSizeInBits(
UserTE->Scalars.front()->getType());
@@ -20653,7 +21132,12 @@ void BoUpSLP::computeMinimumValueSizes() {
if (!IsKnownPositive)
++BitWidth1;
- APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
+ auto *I = dyn_cast<Instruction>(Root);
+ if (!I) {
+ MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
+ continue;
+ }
+ APInt Mask = DB->getDemandedBits(I);
unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
MaxBitWidth =
std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
@@ -20802,6 +21286,7 @@ void BoUpSLP::computeMinimumValueSizes() {
NodeIdx < VectorizableTree.size() &&
VectorizableTree[NodeIdx]->UserTreeIndex &&
VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
+ VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
Instruction::Trunc &&
!VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
@@ -20982,7 +21467,9 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
for (Value *V : Chain)
ValOps.insert(cast<StoreInst>(V)->getValueOperand());
// Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
- InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
+ InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
+ InstructionsState S = Analysis.buildInstructionsState(
+ ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
DenseSet<Value *> Stores(Chain.begin(), Chain.end());
bool IsAllowedSize =
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 40a5565..25b9616 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -962,7 +962,11 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
BackedgeTakenCount->setUnderlyingValue(TCMO);
}
- VectorTripCount.setUnderlyingValue(VectorTripCountV);
+ if (!VectorTripCount.getUnderlyingValue())
+ VectorTripCount.setUnderlyingValue(VectorTripCountV);
+ else
+ assert(VectorTripCount.getUnderlyingValue() == VectorTripCountV &&
+ "VectorTripCount set earlier must much VectorTripCountV");
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
// FIXME: Model VF * UF computation completely in VPlan.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index db40ce2..6655149 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -906,10 +906,10 @@ template <unsigned PartOpIdx> class LLVM_ABI_FOR_TEST VPUnrollPartAccessor {
protected:
/// Return the VPValue operand containing the unroll part or null if there is
/// no such operand.
- VPValue *getUnrollPartOperand(VPUser &U) const;
+ VPValue *getUnrollPartOperand(const VPUser &U) const;
/// Return the unroll part.
- unsigned getUnrollPart(VPUser &U) const;
+ unsigned getUnrollPart(const VPUser &U) const;
};
/// Helper to manage IR metadata for recipes. It filters out metadata that
@@ -1662,6 +1662,8 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
VPSlotTracker &SlotTracker) const override;
#endif
+ unsigned getOpcode() const { return Instruction::Select; }
+
VPValue *getCond() const {
return getOperand(0);
}
@@ -2335,8 +2337,9 @@ public:
return Idx == 0 ? getOperand(1) : getOperand(Idx * 2 + !isNormalized());
}
- /// Generate the phi/select nodes.
- void execute(VPTransformState &State) override;
+ void execute(VPTransformState &State) override {
+ llvm_unreachable("VPBlendRecipe should be expanded by simplifyBlends");
+ }
/// Return the cost of this VPWidenMemoryRecipe.
InstructionCost computeCost(ElementCount VF,
@@ -3483,7 +3486,7 @@ public:
/// Return true if this VPScalarIVStepsRecipe corresponds to part 0. Note that
/// this is only accurate after the VPlan has been unrolled.
- bool isPart0() { return getUnrollPart(*this) == 0; }
+ bool isPart0() const { return getUnrollPart(*this) == 0; }
VP_CLASSOF_IMPL(VPDef::VPScalarIVStepsSC)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 194874a..5319b8c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -437,9 +437,12 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB,
// We are about to replace the branch to exit the region. Remove the original
// BranchOnCond, if there is any.
+ DebugLoc LatchDL = DL;
if (!LatchVPBB->empty() &&
- match(&LatchVPBB->back(), m_BranchOnCond(m_VPValue())))
+ match(&LatchVPBB->back(), m_BranchOnCond(m_VPValue()))) {
+ LatchDL = LatchVPBB->getTerminator()->getDebugLoc();
LatchVPBB->getTerminator()->eraseFromParent();
+ }
VPBuilder Builder(LatchVPBB);
// Add a VPInstruction to increment the scalar canonical IV by VF * UF.
@@ -452,7 +455,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB,
// Add the BranchOnCount VPInstruction to the latch.
Builder.createNaryOp(VPInstruction::BranchOnCount,
- {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
+ {CanonicalIVIncrement, &Plan.getVectorTripCount()},
+ LatchDL);
}
void VPlanTransforms::prepareForVectorization(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index f0cab79..fc8458c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -184,8 +184,7 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) {
VPValue *Cond = SI->getOperand(0);
VPBasicBlock *DefaultDst = cast<VPBasicBlock>(Src->getSuccessors()[0]);
MapVector<VPBasicBlock *, SmallVector<VPValue *>> Dst2Compares;
- for (const auto &[Idx, Succ] :
- enumerate(ArrayRef(Src->getSuccessors()).drop_front())) {
+ for (const auto &[Idx, Succ] : enumerate(drop_begin(Src->getSuccessors()))) {
VPBasicBlock *Dst = cast<VPBasicBlock>(Succ);
assert(!getEdgeMask(Src, Dst) && "Edge masks already created");
// Cases whose destination is the same as default are redundant and can
@@ -206,7 +205,7 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) {
// cases with destination == Dst are taken. Join the conditions for each
// case whose destination == Dst using an OR.
VPValue *Mask = Conds[0];
- for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
+ for (VPValue *V : drop_begin(Conds))
Mask = Builder.createOr(Mask, V);
if (SrcMask)
Mask = Builder.createLogicalAnd(SrcMask, Mask);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1fbc3f3..0d6152b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -413,14 +413,14 @@ void VPSingleDefRecipe::dump() const { VPDef::dump(); }
template <unsigned PartOpIdx>
VPValue *
-VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
+VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(const VPUser &U) const {
if (U.getNumOperands() == PartOpIdx + 1)
return U.getOperand(PartOpIdx);
return nullptr;
}
template <unsigned PartOpIdx>
-unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
+unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(const VPUser &U) const {
if (auto *UnrollPartOp = getUnrollPartOperand(U))
return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
return 0;
@@ -991,7 +991,13 @@ bool VPInstruction::isVectorToScalar() const {
}
bool VPInstruction::isSingleScalar() const {
- return getOpcode() == Instruction::PHI || isScalarCast();
+ switch (getOpcode()) {
+ case Instruction::PHI:
+ case VPInstruction::ExplicitVectorLength:
+ return true;
+ default:
+ return isScalarCast();
+ }
}
void VPInstruction::execute(VPTransformState &State) {
@@ -2411,42 +2417,6 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-void VPBlendRecipe::execute(VPTransformState &State) {
- assert(isNormalized() && "Expected blend to be normalized!");
- // We know that all PHIs in non-header blocks are converted into
- // selects, so we don't have to worry about the insertion order and we
- // can just use the builder.
- // At this point we generate the predication tree. There may be
- // duplications since this is a simple recursive scan, but future
- // optimizations will clean it up.
-
- unsigned NumIncoming = getNumIncomingValues();
-
- // Generate a sequence of selects of the form:
- // SELECT(Mask3, In3,
- // SELECT(Mask2, In2,
- // SELECT(Mask1, In1,
- // In0)))
- // Note that Mask0 is never used: lanes for which no path reaches this phi and
- // are essentially undef are taken from In0.
- bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
- Value *Result = nullptr;
- for (unsigned In = 0; In < NumIncoming; ++In) {
- // We might have single edge PHIs (blocks) - use an identity
- // 'select' for the first PHI operand.
- Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed);
- if (In == 0)
- Result = In0; // Initialize with the first incoming value.
- else {
- // Select between the current value and the previous incoming edge
- // based on the incoming mask.
- Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed);
- Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi");
- }
- }
- State.set(this, Result, OnlyFirstLaneUsed);
-}
-
InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
// Handle cases where only the first lane is used the same way as the legacy
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cb370fe..3d8e149 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -997,7 +997,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
// InstSimplifyFolder.
if (TypeSwitch<VPRecipeBase *, bool>(&R)
.Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
- VPReplicateRecipe>([&](auto *I) {
+ VPReplicateRecipe, VPWidenSelectRecipe>([&](auto *I) {
const DataLayout &DL =
Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL,
@@ -1883,9 +1883,7 @@ void VPlanTransforms::truncateToMinimalBitwidths(
}
}
-/// Remove BranchOnCond recipes with true or false conditions together with
-/// removing dead edges to their successors.
-static void removeBranchOnConst(VPlan &Plan) {
+void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {
using namespace llvm::VPlanPatternMatch;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getEntry()))) {
@@ -1908,12 +1906,9 @@ static void removeBranchOnConst(VPlan &Plan) {
"There must be a single edge between VPBB and its successor");
// Values coming from VPBB into phi recipes of RemoveSucc are removed from
// these recipes.
- for (VPRecipeBase &R : RemovedSucc->phis()) {
- auto *Phi = cast<VPPhiAccessors>(&R);
- assert((!isa<VPIRPhi>(&R) || RemovedSucc->getNumPredecessors() == 1) &&
- "VPIRPhis must have a single predecessor");
- Phi->removeIncomingValueFor(VPBB);
- }
+ for (VPRecipeBase &R : RemovedSucc->phis())
+ cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
+
// Disconnect blocks and remove the terminator. RemovedSucc will be deleted
// automatically on VPlan destruction if it becomes unreachable.
VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
@@ -2711,6 +2706,18 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
continue;
}
+ // Expand VPBlendRecipe into VPInstruction::Select.
+ VPBuilder Builder(&R);
+ if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
+ VPValue *Select = Blend->getIncomingValue(0);
+ for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
+ Select = Builder.createSelect(Blend->getMask(I),
+ Blend->getIncomingValue(I), Select,
+ R.getDebugLoc(), "predphi");
+ Blend->replaceAllUsesWith(Select);
+ ToRemove.push_back(Blend);
+ }
+
if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
Expr->decompose();
ToRemove.push_back(Expr);
@@ -2724,7 +2731,6 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
// Expand WideIVStep.
auto *VPI = cast<VPInstruction>(&R);
- VPBuilder Builder(VPI);
Type *IVTy = TypeInfo.inferScalarType(VPI);
if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
@@ -3082,6 +3088,29 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
}
}
+void VPlanTransforms::materializeVectorTripCount(
+ VPlan &Plan, ElementCount BestVF, unsigned BestUF,
+ PredicatedScalarEvolution &PSE) {
+ assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
+ assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
+
+ VPValue *TC = Plan.getTripCount();
+ // Skip cases for which the trip count may be non-trivial to materialize.
+ if (!Plan.hasScalarTail() ||
+ Plan.getMiddleBlock()->getSingleSuccessor() ==
+ Plan.getScalarPreheader() ||
+ !TC->isLiveIn())
+ return;
+ // Materialize vector trip counts for constants early if it can simply
+ // be computed as (Original TC / VF * UF) * VF * UF.
+ ScalarEvolution &SE = *PSE.getSE();
+ auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
+ const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
+ auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
+ if (auto *NewC = dyn_cast<SCEVConstant>(VecTCScev))
+ Plan.getVectorTripCount().setUnderlyingValue(NewC->getValue());
+}
+
/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index ab189f6..d5af6cd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -224,6 +224,10 @@ struct VPlanTransforms {
/// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy);
+ /// Remove BranchOnCond recipes with true or false conditions together with
+ /// removing dead edges to their successors.
+ static void removeBranchOnConst(VPlan &Plan);
+
/// If there's a single exit block, optimize its phi recipes that use exiting
/// IV values by feeding them precomputed end values instead, possibly taken
/// one step backwards.
@@ -234,6 +238,12 @@ struct VPlanTransforms {
/// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors.
static void materializeBroadcasts(VPlan &Plan);
+ // Materialize vector trip counts for constants early if it can simply be
+ // computed as (Original TC / VF * UF) * VF * UF.
+ static void materializeVectorTripCount(VPlan &Plan, ElementCount BestVF,
+ unsigned BestUF,
+ PredicatedScalarEvolution &PSE);
+
/// Try to convert a plan with interleave groups with VF elements to a plan
/// with the interleave groups replaced by wide loads and stores processing VF
/// elements, if all transformed interleave groups access the full vector
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 82adc34..6252f4f 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3174,6 +3174,55 @@ bool VectorCombine::foldCastFromReductions(Instruction &I) {
return true;
}
+/// Returns true if this ShuffleVectorInst eventually feeds into a
+/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
+/// chains of shuffles and binary operators (in any combination/order).
+/// The search does not go deeper than the given Depth.
+static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
+ constexpr unsigned MaxVisited = 32;
+ SmallPtrSet<Instruction *, 8> Visited;
+ SmallVector<Instruction *, 4> WorkList;
+ bool FoundReduction = false;
+
+ WorkList.push_back(SVI);
+ while (!WorkList.empty()) {
+ Instruction *I = WorkList.pop_back_val();
+ for (User *U : I->users()) {
+ auto *UI = cast<Instruction>(U);
+ if (!UI || !Visited.insert(UI).second)
+ continue;
+ if (Visited.size() > MaxVisited)
+ return false;
+ if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
+ // More than one reduction reached
+ if (FoundReduction)
+ return false;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::vector_reduce_add:
+ case Intrinsic::vector_reduce_mul:
+ case Intrinsic::vector_reduce_and:
+ case Intrinsic::vector_reduce_or:
+ case Intrinsic::vector_reduce_xor:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_smax:
+ case Intrinsic::vector_reduce_umin:
+ case Intrinsic::vector_reduce_umax:
+ FoundReduction = true;
+ continue;
+ default:
+ return false;
+ }
+ }
+
+ if (!isa<BinaryOperator>(UI) && !isa<ShuffleVectorInst>(UI))
+ return false;
+
+ WorkList.emplace_back(UI);
+ }
+ }
+ return FoundReduction;
+}
+
/// This method looks for groups of shuffles acting on binops, of the form:
/// %x = shuffle ...
/// %y = shuffle ...
@@ -3416,6 +3465,65 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
};
+ unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
+ unsigned MaxVectorSize =
+ TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
+ unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
+ // When there are multiple shufflevector operations on the same input,
+ // especially when the vector length is larger than the register size,
+ // identical shuffle patterns may occur across different groups of elements.
+ // To avoid overestimating the cost by counting these repeated shuffles more
+ // than once, we only account for unique shuffle patterns. This adjustment
+ // prevents inflated costs in the cost model for wide vectors split into
+ // several register-sized groups.
+ std::set<SmallVector<int, 4>> UniqueShuffles;
+ auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
+ // Compute the cost for performing the shuffle over the full vector.
+ auto ShuffleCost =
+ TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
+ unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
+ if (NumFullVectors < 2)
+ return C + ShuffleCost;
+ SmallVector<int, 4> SubShuffle(MaxElementsInVector);
+ unsigned NumUniqueGroups = 0;
+ unsigned NumGroups = Mask.size() / MaxElementsInVector;
+ // For each group of MaxElementsInVector contiguous elements,
+ // collect their shuffle pattern and insert into the set of unique patterns.
+ for (unsigned I = 0; I < NumFullVectors; ++I) {
+ for (unsigned J = 0; J < MaxElementsInVector; ++J)
+ SubShuffle[J] = Mask[MaxElementsInVector * I + J];
+ if (UniqueShuffles.insert(SubShuffle).second)
+ NumUniqueGroups += 1;
+ }
+ return C + ShuffleCost * NumUniqueGroups / NumGroups;
+ };
+ auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
+ auto *SV = dyn_cast<ShuffleVectorInst>(I);
+ if (!SV)
+ return C;
+ SmallVector<int, 16> Mask;
+ SV->getShuffleMask(Mask);
+ return AddShuffleMaskAdjustedCost(C, Mask);
+ };
+ // Check that input consists of ShuffleVectors applied to the same input
+ auto AllShufflesHaveSameOperands =
+ [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
+ if (InputShuffles.size() < 2)
+ return false;
+ ShuffleVectorInst *FirstSV =
+ dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
+ if (!FirstSV)
+ return false;
+
+ Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
+ return std::all_of(
+ std::next(InputShuffles.begin()), InputShuffles.end(),
+ [&](Instruction *I) {
+ ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
+ return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
+ });
+ };
+
// Get the costs of the shuffles + binops before and after with the new
// shuffle masks.
InstructionCost CostBefore =
@@ -3423,8 +3531,14 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
InstructionCost(0), AddShuffleCost);
- CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
- InstructionCost(0), AddShuffleCost);
+ if (AllShufflesHaveSameOperands(InputShuffles)) {
+ UniqueShuffles.clear();
+ CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
+ InstructionCost(0), AddShuffleAdjustedCost);
+ } else {
+ CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
+ InstructionCost(0), AddShuffleCost);
+ }
// The new binops will be unused for lanes past the used shuffle lengths.
// These types attempt to get the correct cost for that from the target.
@@ -3435,8 +3549,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
InstructionCost CostAfter =
TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
+ UniqueShuffles.clear();
CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
- InstructionCost(0), AddShuffleMaskCost);
+ InstructionCost(0), AddShuffleMaskAdjustedCost);
std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
CostAfter +=
std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
@@ -3445,7 +3560,8 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
<< " vs CostAfter: " << CostAfter << "\n");
- if (CostBefore <= CostAfter)
+ if (CostBefore < CostAfter ||
+ (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
return false;
// The cost model has passed, create the new instructions.