Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPU.h | 17
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPU.td | 156
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp | 26
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 108
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 59
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 35
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp | 22
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 18
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 219
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 18
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 273
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 237
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 37
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 96
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 12
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 188
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 33
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp | 240
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 146
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp | 84
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 9
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 22
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 7
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 146
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp | 35
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 253
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h | 39
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 107
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h | 16
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 45
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 7
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp | 24
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td | 11
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp | 48
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h | 40
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 84
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 135
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 17
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 90
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp | 17
-rw-r--r-- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 24
-rw-r--r-- llvm/lib/Target/AMDGPU/BUFInstructions.td | 54
-rw-r--r-- llvm/lib/Target/AMDGPU/CMakeLists.txt | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/DSInstructions.td | 21
-rw-r--r-- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 11
-rw-r--r-- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/EvergreenInstructions.td | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/FLATInstructions.td | 40
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 126
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 9
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 60
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 10
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 63
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 47
-rw-r--r-- llvm/lib/Target/AMDGPU/InstCombineTables.td | 10
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 99
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp | 26
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 7
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/MIMGInstructions.td | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/R600.td | 21
-rw-r--r-- llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 15
-rw-r--r-- llvm/lib/Target/AMDGPU/R600InstrInfo.cpp | 10
-rw-r--r-- llvm/lib/Target/AMDGPU/R600MCInstLower.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SIDefines.h | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 47
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 116
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 33
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFrameLowering.h | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 290
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.h | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 162
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 440
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 37
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 119
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstructions.td | 101
-rw-r--r-- llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 72
-rw-r--r-- llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 9
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 21
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 9
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 63
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineScheduler.h | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 1221
-rw-r--r-- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 25
-rw-r--r-- llvm/lib/Target/AMDGPU/SIPostRABundler.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 124
-rw-r--r-- llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 36
-rw-r--r-- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 119
-rw-r--r-- llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 23
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 117
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 41
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 53
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 24
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 31
-rw-r--r-- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 10
-rw-r--r-- llvm/lib/Target/AMDGPU/VOPInstructions.td | 12
133 files changed, 4082 insertions, 3295 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index cd8b249..5df11a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -69,7 +69,7 @@ FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass();
ModulePass *createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *);
struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
- AMDGPUSimplifyLibCallsPass() {}
+ AMDGPUSimplifyLibCallsPass() = default;
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
@@ -298,6 +298,15 @@ private:
bool GlobalOpt;
};
+void initializeAMDGPULowerExecSyncLegacyPass(PassRegistry &);
+extern char &AMDGPULowerExecSyncLegacyPassID;
+ModulePass *createAMDGPULowerExecSyncLegacyPass();
+
+struct AMDGPULowerExecSyncPass : PassInfoMixin<AMDGPULowerExecSyncPass> {
+ AMDGPULowerExecSyncPass() {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &);
extern char &AMDGPUSwLowerLDSLegacyPassID;
ModulePass *
@@ -371,13 +380,13 @@ public:
class AMDGPUAnnotateUniformValuesPass
: public PassInfoMixin<AMDGPUAnnotateUniformValuesPass> {
public:
- AMDGPUAnnotateUniformValuesPass() {}
+ AMDGPUAnnotateUniformValuesPass() = default;
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
class SIModeRegisterPass : public PassInfoMixin<SIModeRegisterPass> {
public:
- SIModeRegisterPass() {}
+ SIModeRegisterPass() = default;
PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &AM);
};
@@ -527,7 +536,7 @@ void initializeAMDGPUAAWrapperPassPass(PassRegistry&);
ImmutablePass *createAMDGPUExternalAAWrapperPass();
void initializeAMDGPUExternalAAWrapperPass(PassRegistry&);
-void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
+void initializeAMDGPUArgumentUsageInfoWrapperLegacyPass(PassRegistry &);
ModulePass *createAMDGPUExportKernelRuntimeHandlesLegacyPass();
void initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(PassRegistry &);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 54d94b1..3b14a82 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -901,6 +901,48 @@ def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst",
"Has v_pk_fmac_f16 instruction"
>;
+def FeatureCubeInsts : SubtargetFeature<"cube-insts",
+ "HasCubeInsts",
+ "true",
+ "Has v_cube* instructions"
+>;
+
+def FeatureLerpInst : SubtargetFeature<"lerp-inst",
+ "HasLerpInst",
+ "true",
+ "Has v_lerp_u8 instruction"
+>;
+
+def FeatureSadInsts : SubtargetFeature<"sad-insts",
+ "HasSadInsts",
+ "true",
+ "Has v_sad* instructions"
+>;
+
+def FeatureQsadInsts : SubtargetFeature<"qsad-insts",
+ "HasQsadInsts",
+ "true",
+ "Has v_qsad* instructions"
+>;
+
+def FeatureCvtNormInsts : SubtargetFeature<"cvt-norm-insts",
+ "HasCvtNormInsts",
+ "true",
+ "Has v_cvt_norm* instructions"
+>;
+
+def FeatureCvtPkNormVOP2Insts : SubtargetFeature<"cvt-pknorm-vop2-insts",
+ "HasCvtPkNormVOP2Insts",
+ "true",
+ "Has v_cvt_pk_norm_*f32 instructions/Has v_cvt_pk_norm_*_f16 instructions"
+>;
+
+def FeatureCvtPkNormVOP3Insts : SubtargetFeature<"cvt-pknorm-vop3-insts",
+ "HasCvtPkNormVOP3Insts",
+ "true",
+ "Has v_cvt_pk_norm_*f32 instructions/Has v_cvt_pk_norm_*_f16 instructions"
+>;
+
def FeatureAtomicDsPkAdd16Insts : SubtargetFeature<"atomic-ds-pk-add-16-insts",
"HasAtomicDsPkAdd16Insts",
"true",
@@ -1494,7 +1536,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureVmemWriteVgprInOrder
+ FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst,
+ FeatureSadInsts, FeatureCvtPkNormVOP2Insts
]
>;
@@ -1508,7 +1551,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
- FeatureVmemWriteVgprInOrder
+ FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst,
+ FeatureSadInsts, FeatureQsadInsts, FeatureCvtPkNormVOP2Insts
]
>;
@@ -1524,7 +1568,9 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
- FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder
+ FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder, FeatureCubeInsts,
+ FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts,
+ FeatureCvtPkNormVOP2Insts
]
>;
@@ -1543,7 +1589,10 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS,
- FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad
+ FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad,
+ FeatureCubeInsts, FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts,
+ FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts,
+ FeatureCvtPkNormVOP3Insts
]
>;
@@ -1567,7 +1616,10 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureDefaultComponentZero, FeatureMaxHardClauseLength63,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
- FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad
+ FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad, FeatureCubeInsts,
+ FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts,
+ FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts,
+ FeatureCvtPkNormVOP3Insts
]
>;
@@ -1590,7 +1642,9 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS,
FeatureDefaultComponentZero, FeatureMaxHardClauseLength32,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
- FeatureVmemWriteVgprInOrder
+ FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst,
+ FeatureSadInsts, FeatureQsadInsts, FeatureCvtNormInsts,
+ FeatureCvtPkNormVOP2Insts, FeatureCvtPkNormVOP3Insts
]
>;
@@ -2069,13 +2123,20 @@ def FeatureISAVersion12 : FeatureSet<
FeatureMemoryAtomicFAddF32DenormalSupport,
FeatureBVHDualAndBVH8Insts,
FeatureWaitsBeforeSystemScopeStores,
+ FeatureD16Writes32BitVgpr,
+ FeatureCubeInsts,
+ FeatureLerpInst,
+ FeatureSadInsts,
+ FeatureQsadInsts,
+ FeatureCvtNormInsts,
+ FeatureCvtPkNormVOP2Insts,
+ FeatureCvtPkNormVOP3Insts
]>;
-def FeatureISAVersion12_50 : FeatureSet<
+def FeatureISAVersion12_50_Common : FeatureSet<
[FeatureGFX12,
FeatureGFX1250Insts,
FeatureRequiresAlignedVGPRs,
- FeatureAddressableLocalMemorySize327680,
FeatureCuMode,
Feature1024AddressableVGPRs,
Feature64BitLiterals,
@@ -2143,8 +2204,20 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureSupportsXNACK,
FeatureXNACK,
FeatureClusters,
+ FeatureD16Writes32BitVgpr,
+ FeatureCubeInsts,
+ FeatureLerpInst,
+ FeatureSadInsts,
+ FeatureQsadInsts,
+ FeatureCvtNormInsts,
+ FeatureCvtPkNormVOP2Insts,
+ FeatureCvtPkNormVOP3Insts
]>;
+def FeatureISAVersion12_50 : FeatureSet<
+ !listconcat(FeatureISAVersion12_50_Common.Features,
+ [FeatureAddressableLocalMemorySize327680])>;
+
def FeatureISAVersion12_51 : FeatureSet<
!listconcat(FeatureISAVersion12_50.Features,
[FeatureDPALU_DPP])>;
@@ -2523,6 +2596,10 @@ def HasAtomicFMinFMaxF64FlatInsts :
Predicate<"Subtarget->hasAtomicFMinFMaxF64FlatInsts()">,
AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64FlatInsts)>;
+def HasAtomicCondSubClampFlatInsts :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">,
+ AssemblerPredicate<(all_of FeatureGFX12Insts)>;
+
def HasLdsAtomicAddF64 :
Predicate<"Subtarget->hasLdsAtomicAddF64()">,
AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX1250Insts)>;
@@ -2814,6 +2891,27 @@ def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">,
def HasFP8ConversionInsts : Predicate<"Subtarget->hasFP8ConversionInsts()">,
AssemblerPredicate<(all_of FeatureFP8ConversionInsts)>;
+def HasCubeInsts : Predicate<"Subtarget->hasCubeInsts()">,
+ AssemblerPredicate<(all_of FeatureCubeInsts)>;
+
+def HasLerpInst : Predicate<"Subtarget->hasLerpInst()">,
+ AssemblerPredicate<(all_of FeatureLerpInst)>;
+
+def HasSadInsts : Predicate<"Subtarget->hasSadInsts()">,
+ AssemblerPredicate<(all_of FeatureSadInsts)>;
+
+def HasQsadInsts : Predicate<"Subtarget->hasQsadInsts()">,
+ AssemblerPredicate<(all_of FeatureQsadInsts)>;
+
+def HasCvtNormInsts : Predicate<"Subtarget->hasCvtNormInsts()">,
+ AssemblerPredicate<(all_of FeatureCvtNormInsts)>;
+
+def HasCvtPkNormVOP2Insts : Predicate<"Subtarget->hasCvtPkNormVOP2Insts()">,
+ AssemblerPredicate<(all_of FeatureCvtPkNormVOP2Insts)>;
+
+def HasCvtPkNormVOP3Insts : Predicate<"Subtarget->hasCvtPkNormVOP3Insts()">,
+ AssemblerPredicate<(all_of FeatureCvtPkNormVOP3Insts)>;
+
def HasFP8E5M3Insts : Predicate<"Subtarget->hasFP8E5M3Insts()">,
AssemblerPredicate<(all_of FeatureFP8E5M3Insts)>;
@@ -2829,9 +2927,16 @@ def HasMadMacF32Insts : Predicate<"Subtarget->hasMadMacF32Insts()">,
def HasFmaLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts()">,
AssemblerPredicate<(any_of FeatureGFX10_3Insts)>;
+def HasFmacLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts() && Subtarget->getGeneration() < AMDGPUSubtarget::GFX12">,
+ AssemblerPredicate<(all_of FeatureGFX10_3Insts, (not FeatureGFX12Insts))>;
+
def HasAtomicDsPkAdd16Insts : Predicate<"Subtarget->hasAtomicDsPkAdd16Insts()">,
AssemblerPredicate<(any_of FeatureAtomicDsPkAdd16Insts)>;
+def HasAtomicDsCondSubClampInsts :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">,
+ AssemblerPredicate<(all_of FeatureGFX12Insts)>;
+
def HasAtomicFlatPkAdd16Insts : Predicate<"Subtarget->hasAtomicFlatPkAdd16Insts()">,
AssemblerPredicate<(any_of FeatureAtomicFlatPkAdd16Insts)>;
@@ -2974,15 +3079,46 @@ def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">,
def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">,
AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>;
+def NotNeedsAlignedVGPRs : Predicate<"!Subtarget->needsAlignedVGPRs()">,
+ AssemblerPredicate<(all_of (not FeatureRequiresAlignedVGPRs))>;
+
+def isWave32 : Predicate<"Subtarget->isWave32()">,
+ AssemblerPredicate <(any_of FeatureWavefrontSize32,
+ FeatureAssemblerPermissiveWavesize)>;
+def isWave64 : Predicate<"Subtarget->isWave64()">,
+ AssemblerPredicate <(any_of FeatureWavefrontSize64,
+ FeatureAssemblerPermissiveWavesize)>;
+
+def isWave32Strict : Predicate<"Subtarget->isWave32()">,
+ AssemblerPredicate <(all_of FeatureWavefrontSize32)>;
+def isWave64Strict : Predicate<"Subtarget->isWave64()">,
+ AssemblerPredicate <(all_of FeatureWavefrontSize64)>;
+
//===----------------------------------------------------------------------===//
// HwModes
//===----------------------------------------------------------------------===//
-// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement
+defvar DefaultMode_Wave64 = DefaultMode;
+defvar DefaultMode_Wave32 = HwMode<[isWave32, NotNeedsAlignedVGPRs]>;
+
+// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement. Implied
+// wave64.
def AVAlign2LoadStoreMode : HwMode<[HasMAIInsts, NeedsAlignedVGPRs]>;
// gfx1250, has alignment requirement but no AGPRs.
-def AlignedVGPRNoAGPRMode : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs]>;
+def AlignedVGPRNoAGPRMode_Wave32 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave32Strict]>;
+def AlignedVGPRNoAGPRMode_Wave64 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave64Strict]>;
+
+// FIXME: This should be able to only define a separate hwmode that
+// only depends on wavesize for just ValueTypes. These use different
+// HwMode namespaces. If we don't define the full set of modes used
+// for RegClassByHwMode, tablegen crashes for some reason
+def WaveSizeVT : ValueTypeByHwMode<[
+ DefaultMode_Wave64,
+ AVAlign2LoadStoreMode,
+ AlignedVGPRNoAGPRMode_Wave64,
+ DefaultMode_Wave32,
+ AlignedVGPRNoAGPRMode_Wave32], [i64, i64, i64, i32, i32]>;
// Include AMDGPU TD files
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index c28c25f..2bdadda 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -65,7 +65,7 @@ recursivelyVisitUsers(GlobalValue &GV,
continue;
if (Instruction *I = dyn_cast<Instruction>(U)) {
- Function *F = I->getParent()->getParent();
+ Function *F = I->getFunction();
if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
// FIXME: This is a horrible hack. We should always respect noinline,
// and just let us hit the error when we can't handle this.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index dda8033..346e257 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -19,7 +19,7 @@ using namespace llvm;
#define DEBUG_TYPE "amdgpu-argument-reg-usage-info"
-INITIALIZE_PASS(AMDGPUArgumentUsageInfo, DEBUG_TYPE,
+INITIALIZE_PASS(AMDGPUArgumentUsageInfoWrapperLegacy, DEBUG_TYPE,
"Argument Register Usage Information Storage", false, true)
void ArgDescriptor::print(raw_ostream &OS,
@@ -42,7 +42,7 @@ void ArgDescriptor::print(raw_ostream &OS,
OS << '\n';
}
-char AMDGPUArgumentUsageInfo::ID = 0;
+char AMDGPUArgumentUsageInfoWrapperLegacy::ID = 0;
const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{};
@@ -50,15 +50,6 @@ const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{};
const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::FixedABIFunctionInfo
= AMDGPUFunctionArgInfo::fixedABILayout();
-bool AMDGPUArgumentUsageInfo::doInitialization(Module &M) {
- return false;
-}
-
-bool AMDGPUArgumentUsageInfo::doFinalization(Module &M) {
- ArgInfoMap.clear();
- return false;
-}
-
// TODO: Print preload kernargs?
void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const {
for (const auto &FI : ArgInfoMap) {
@@ -86,6 +77,12 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const {
}
}
+bool AMDGPUArgumentUsageInfo::invalidate(Module &M, const PreservedAnalyses &PA,
+ ModuleAnalysisManager::Invalidator &) {
+ auto PAC = PA.getChecker<AMDGPUArgumentUsageAnalysis>();
+ return !PAC.preservedWhenStateless();
+}
+
std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
AMDGPUFunctionArgInfo::getPreloadedValue(
AMDGPUFunctionArgInfo::PreloadedValue Value) const {
@@ -191,3 +188,10 @@ AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const {
return FixedABIFunctionInfo;
return I->second;
}
+
+AnalysisKey AMDGPUArgumentUsageAnalysis::Key;
+
+AMDGPUArgumentUsageInfo
+AMDGPUArgumentUsageAnalysis::run(Module &M, ModuleAnalysisManager &) {
+ return AMDGPUArgumentUsageInfo();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 1064e57..f41739a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -12,10 +12,15 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/Register.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include <variant>
namespace llvm {
+void initializeAMDGPUArgumentUsageInfoWrapperLegacyPass(PassRegistry &);
+
class Function;
class LLT;
class raw_ostream;
@@ -27,55 +32,44 @@ private:
friend struct AMDGPUFunctionArgInfo;
friend class AMDGPUArgumentUsageInfo;
- union {
- MCRegister Reg;
- unsigned StackOffset;
- };
+ std::variant<std::monostate, MCRegister, unsigned> Val;
// Bitmask to locate argument within the register.
unsigned Mask;
- bool IsStack : 1;
- bool IsSet : 1;
-
public:
- ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, bool IsStack = false,
- bool IsSet = false)
- : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
+ ArgDescriptor(unsigned Mask = ~0u) : Mask(Mask) {}
static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
- return ArgDescriptor(Reg, Mask, false, true);
+ ArgDescriptor Ret(Mask);
+ Ret.Val = Reg.asMCReg();
+ return Ret;
}
static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
- return ArgDescriptor(Offset, Mask, true, true);
+ ArgDescriptor Ret(Mask);
+ Ret.Val = Offset;
+ return Ret;
}
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
- return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
+ // Copy the descriptor, then change the mask.
+ ArgDescriptor Ret(Arg);
+ Ret.Mask = Mask;
+ return Ret;
}
- bool isSet() const {
- return IsSet;
- }
+ bool isSet() const { return !std::holds_alternative<std::monostate>(Val); }
explicit operator bool() const {
return isSet();
}
- bool isRegister() const {
- return !IsStack;
- }
+ bool isRegister() const { return std::holds_alternative<MCRegister>(Val); }
- MCRegister getRegister() const {
- assert(!IsStack);
- return Reg;
- }
+ MCRegister getRegister() const { return std::get<MCRegister>(Val); }
- unsigned getStackOffset() const {
- assert(IsStack);
- return StackOffset;
- }
+ unsigned getStackOffset() const { return std::get<unsigned>(Val); }
unsigned getMask() const {
// None of the target SGPRs or VGPRs are expected to have a 'zero' mask.
@@ -96,7 +90,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
}
struct KernArgPreloadDescriptor : public ArgDescriptor {
- KernArgPreloadDescriptor() {}
+ KernArgPreloadDescriptor() = default;
SmallVector<MCRegister> Regs;
};
@@ -178,32 +172,70 @@ struct AMDGPUFunctionArgInfo {
static AMDGPUFunctionArgInfo fixedABILayout();
};
-class AMDGPUArgumentUsageInfo : public ImmutablePass {
+class AMDGPUArgumentUsageInfo {
private:
DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
public:
- static char ID;
-
static const AMDGPUFunctionArgInfo ExternFunctionInfo;
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo;
- AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
+ void print(raw_ostream &OS, const Module *M = nullptr) const;
+
+ void clear() { ArgInfoMap.clear(); }
+
+ void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
+ ArgInfoMap[&F] = ArgInfo;
+ }
+
+ const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
+
+ bool invalidate(Module &M, const PreservedAnalyses &PA,
+ ModuleAnalysisManager::Invalidator &Inv);
+};
+
+class AMDGPUArgumentUsageInfoWrapperLegacy : public ImmutablePass {
+ std::unique_ptr<AMDGPUArgumentUsageInfo> AUIP;
+
+public:
+ static char ID;
+
+ AMDGPUArgumentUsageInfoWrapperLegacy() : ImmutablePass(ID) {
+ initializeAMDGPUArgumentUsageInfoWrapperLegacyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ AMDGPUArgumentUsageInfo &getArgUsageInfo() { return *AUIP; }
+ const AMDGPUArgumentUsageInfo &getArgUsageInfo() const { return *AUIP; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
}
- bool doInitialization(Module &M) override;
- bool doFinalization(Module &M) override;
+ bool doInitialization(Module &M) override {
+ AUIP = std::make_unique<AMDGPUArgumentUsageInfo>();
+ return false;
+ }
- void print(raw_ostream &OS, const Module *M = nullptr) const override;
+ bool doFinalization(Module &M) override {
+ AUIP->clear();
+ return false;
+ }
- void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
- ArgInfoMap[&F] = ArgInfo;
+ void print(raw_ostream &OS, const Module *M = nullptr) const override {
+ AUIP->print(OS, M);
}
+};
- const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
+class AMDGPUArgumentUsageAnalysis
+ : public AnalysisInfoMixin<AMDGPUArgumentUsageAnalysis> {
+ friend AnalysisInfoMixin<AMDGPUArgumentUsageAnalysis>;
+ static AnalysisKey Key;
+
+public:
+ using Result = AMDGPUArgumentUsageInfo;
+
+ AMDGPUArgumentUsageInfo run(Module &M, ModuleAnalysisManager &);
};
} // end namespace llvm
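Side note, not part of the patch: the ArgDescriptor rewrite above replaces the union plus IsStack/IsSet bit-fields with std::variant<std::monostate, MCRegister, unsigned>, where monostate means "unset" and each remaining alternative identifies a location kind. A minimal standalone C++ sketch of that encoding, using simplified stand-in types rather than the real LLVM classes:

    #include <cassert>
    #include <variant>

    // Sketch of the discriminated-value scheme: std::monostate means "unset",
    // and each other alternative identifies a location kind. The types here are
    // stand-ins (unsigned for MCRegister, int for a stack offset).
    struct DescriptorSketch {
      std::variant<std::monostate, unsigned, int> Val;
      bool isSet() const { return !std::holds_alternative<std::monostate>(Val); }
      bool isRegister() const { return std::holds_alternative<unsigned>(Val); }
      bool isStack() const { return std::holds_alternative<int>(Val); }
    };

    int main() {
      DescriptorSketch D;
      assert(!D.isSet());   // default-constructed: no location assigned yet
      D.Val = 42u;          // register alternative
      assert(D.isRegister() && !D.isStack());
      D.Val = 16;           // stack-offset alternative
      assert(D.isStack());
      return 0;
    }

This removes the need to keep the union tag bits consistent by hand; the variant index is the tag.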
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp
index 93732a7..9af3b05 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp
@@ -208,7 +208,8 @@ void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns,
Value *NumBits = IRB.CreateTypeSize(IntptrTy, TypeStoreSize);
Value *Size = IRB.CreateLShr(NumBits, ConstantInt::get(IntptrTy, 3));
Value *AddrLong = IRB.CreatePtrToInt(Addr, IntptrTy);
- Value *SizeMinusOne = IRB.CreateAdd(Size, ConstantInt::get(IntptrTy, -1));
+ Value *SizeMinusOne =
+ IRB.CreateAdd(Size, ConstantInt::getAllOnesValue(IntptrTy));
Value *LastByte =
IRB.CreateIntToPtr(IRB.CreateAdd(AddrLong, SizeMinusOne), AddrTy);
instrumentAddressImpl(M, IRB, OrigIns, InsertBefore, Addr, {}, 8, IsWrite,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 29f8f9b..d0c86a7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -358,6 +358,32 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) {
return AsmPrinter::doInitialization(M);
}
+/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
+///
+/// Remove dependency on GCNSubtarget and depend only on the necessary values
+/// for said occupancy computation. Should match computeOccupancy implementation
+/// without passing \p STM on.
+const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
+ const MCExpr *NumVGPRs,
+ unsigned DynamicVGPRBlockSize,
+ const GCNSubtarget &STM, MCContext &Ctx) {
+ unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
+ unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
+ unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
+ unsigned Generation = STM.getGeneration();
+
+ auto CreateExpr = [&Ctx](unsigned Value) {
+ return MCConstantExpr::create(Value, Ctx);
+ };
+
+ return AMDGPUMCExpr::create(AMDGPUMCExpr::AGVK_Occupancy,
+ {CreateExpr(MaxWaves), CreateExpr(Granule),
+ CreateExpr(TargetTotalNumVGPRs),
+ CreateExpr(Generation), CreateExpr(InitOcc),
+ NumSGPRs, NumVGPRs},
+ Ctx);
+}
+
void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
return;
@@ -459,7 +485,7 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
MaxWaves, MFI.getDynamicVGPRBlockSize())});
uint64_t NumSGPRsForWavesPerEU = std::max(
{NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
- const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
+ const MCExpr *OccupancyExpr = createOccupancy(
STM.getOccupancyWithWorkGroupSizes(*MF).second,
MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
@@ -508,9 +534,9 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
MCSectionELF *MaxGPRSection =
OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
OutStreamer->switchSection(MaxGPRSection);
- getTargetStreamer()->EmitMCResourceMaximums(RI.getMaxVGPRSymbol(OutContext),
- RI.getMaxAGPRSymbol(OutContext),
- RI.getMaxSGPRSymbol(OutContext));
+ getTargetStreamer()->EmitMCResourceMaximums(
+ RI.getMaxVGPRSymbol(OutContext), RI.getMaxAGPRSymbol(OutContext),
+ RI.getMaxSGPRSymbol(OutContext), RI.getMaxNamedBarrierSymbol(OutContext));
OutStreamer->popSection();
for (Function &F : M.functions())
@@ -1160,21 +1186,20 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// Make clamp modifier on NaN input returns 0.
ProgInfo.DX10Clamp = Mode.DX10Clamp;
- unsigned LDSAlignShift;
- if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) {
- // LDS is allocated in 256 dword blocks.
- LDSAlignShift = 10;
- } else if (STM.getFeatureBits().test(
- FeatureAddressableLocalMemorySize163840)) {
- // LDS is allocated in 320 dword blocks.
+ unsigned LDSAlignShift = 8;
+ switch (getLdsDwGranularity(STM)) {
+ case 512:
+ case 320:
LDSAlignShift = 11;
- } else if (STM.getFeatureBits().test(
- FeatureAddressableLocalMemorySize65536)) {
- // LDS is allocated in 128 dword blocks.
+ break;
+ case 128:
LDSAlignShift = 9;
- } else {
- // LDS is allocated in 64 dword blocks.
+ break;
+ case 64:
LDSAlignShift = 8;
+ break;
+ default:
+ llvm_unreachable("invalid LDS block size");
}
ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
@@ -1270,7 +1295,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
- ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
+ ProgInfo.Occupancy = createOccupancy(
STM.computeOccupancy(F, ProgInfo.LDSSize).second,
ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,
MFI->getDynamicVGPRBlockSize(), STM, Ctx);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 0a163f8..784ee36 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -589,7 +589,7 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
// return the next active lane
auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);
- auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
+ auto *InverseMask = B.CreateXor(Mask, ConstantInt::getAllOnesValue(WaveTy));
auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
ActiveBits->addIncoming(NewActiveBits, ComputeLoop);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 9907c88f..d3505cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -38,9 +38,10 @@ enum ImplicitArgumentPositions {
#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
enum ImplicitArgumentMask {
- NOT_IMPLICIT_INPUT = 0,
+ UNKNOWN_INTRINSIC = 0,
#include "AMDGPUAttributes.def"
- ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
+ ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1,
+ NOT_IMPLICIT_INPUT
};
#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
@@ -115,7 +116,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
return QUEUE_PTR;
default:
- return NOT_IMPLICIT_INPUT;
+ return UNKNOWN_INTRINSIC;
}
}
@@ -534,6 +535,21 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
ImplicitArgumentMask AttrMask =
intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
HasApertureRegs, SupportsGetDoorbellID, COV);
+
+ if (AttrMask == UNKNOWN_INTRINSIC) {
+ // Assume not-nocallback intrinsics may invoke a function which accesses
+ // implicit arguments.
+ //
+ // FIXME: This isn't really the correct check. We want to ensure it
+ // isn't calling any function that may use implicit arguments regardless
+ // of whether it's internal to the module or not.
+ //
+ // TODO: Ignoring callsite attributes.
+ if (!Callee->hasFnAttribute(Attribute::NoCallback))
+ return indicatePessimisticFixpoint();
+ continue;
+ }
+
if (AttrMask != NOT_IMPLICIT_INPUT) {
if ((IsNonEntryFunc || !NonKernelOnly))
removeAssumedBits(AttrMask);
@@ -1357,7 +1373,10 @@ struct AAAMDGPUMinAGPRAlloc
default:
// Some intrinsics may use AGPRs, but if we have a choice, we are not
// required to use AGPRs.
- return true;
+
+ // Assume !nocallback intrinsics may call a function which requires
+ // AGPRs.
+ return CB.hasFnAttr(Attribute::NoCallback);
}
// TODO: Handle callsite attributes
@@ -1555,7 +1574,7 @@ private:
AMDGPU::ClusterDimsAttr Attr;
- static constexpr const char AttrName[] = "amdgpu-cluster-dims";
+ static constexpr char AttrName[] = "amdgpu-cluster-dims";
};
AAAMDGPUClusterDims &
@@ -1584,7 +1603,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
&AAAMDGPUMinAGPRAlloc::ID, &AACallEdges::ID, &AAPointerInfo::ID,
&AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID,
&AANoAliasAddrSpace::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
- &AAAMDGPUClusterDims::ID});
+ &AAAMDGPUClusterDims::ID, &AAAlign::ID});
AttributorConfig AC(CGUpdater);
AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1642,6 +1661,10 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
if (Ptr) {
A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr));
A.getOrCreateAAFor<AANoAliasAddrSpace>(IRPosition::value(*Ptr));
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Ptr)) {
+ if (II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc)
+ A.getOrCreateAAFor<AAAlign>(IRPosition::value(*Ptr));
+ }
}
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
index 30a1f05..2e586ea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
@@ -27,8 +27,17 @@ using namespace llvm;
namespace {
class BarrierLatency : public ScheduleDAGMutation {
+private:
+ SmallSet<SyncScope::ID, 4> IgnoredScopes;
+
public:
- BarrierLatency() = default;
+ BarrierLatency(MachineFunction *MF) {
+ LLVMContext &Context = MF->getFunction().getContext();
+ IgnoredScopes.insert(SyncScope::SingleThread);
+ IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront"));
+ IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as"));
+ IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as"));
+ }
void apply(ScheduleDAGInstrs *DAG) override;
};
@@ -40,8 +49,11 @@ void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
continue;
// Update latency on barrier edges of ATOMIC_FENCE.
- // We don't consider the scope of the fence or type of instruction
- // involved in the barrier edge.
+ // Ignore scopes not expected to have any latency.
+ SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
+ if (IgnoredScopes.contains(SSID))
+ continue;
+
for (SDep &PredDep : SU.Preds) {
if (!PredDep.isBarrier())
continue;
@@ -68,6 +80,6 @@ void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
} // end namespace
std::unique_ptr<ScheduleDAGMutation>
-llvm::createAMDGPUBarrierLatencyDAGMutation() {
- return std::make_unique<BarrierLatency>();
+llvm::createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF) {
+ return std::make_unique<BarrierLatency>(MF);
}
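Side note, not part of the patch: the mutation above now takes the MachineFunction because wave-local scope names must be resolved to SyncScope::ID values through the Function's LLVMContext. A rough sketch, not taken from the patch, of how such a scope check could look in isolation:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/IR/LLVMContext.h"

    using namespace llvm;

    // Returns true for fence scopes local to a single wave (or single thread),
    // which the mutation above skips when adding barrier-edge latency.
    bool isWaveLocalScope(LLVMContext &Ctx, SyncScope::ID SSID) {
      return SSID == SyncScope::SingleThread ||
             SSID == Ctx.getOrInsertSyncScopeID("wavefront") ||
             SSID == Ctx.getOrInsertSyncScopeID("wavefront-one-as") ||
             SSID == Ctx.getOrInsertSyncScopeID("singlethread-one-as");
    }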
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
index c23f0b9..547cd2a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
@@ -14,7 +14,10 @@
namespace llvm {
-std::unique_ptr<ScheduleDAGMutation> createAMDGPUBarrierLatencyDAGMutation();
+class MachineFunction;
+
+std::unique_ptr<ScheduleDAGMutation>
+createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF);
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 7afadde..682f1aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/PseudoSourceValueManager.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#define DEBUG_TYPE "amdgpu-call-lowering"
@@ -414,7 +415,8 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg,
MachineFunction &MF = B.getMF();
const Function &F = MF.getFunction();
const DataLayout &DL = F.getDataLayout();
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+ MachinePointerInfo PtrInfo = TLI.getKernargSegmentPtrInfo(MF);
LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8e35ba7..71ea9ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -143,14 +143,6 @@ public:
bool canBreakPHINode(const PHINode &I);
- /// \returns True if binary operation \p I is a signed binary operation, false
- /// otherwise.
- bool isSigned(const BinaryOperator &I) const;
-
- /// \returns True if the condition of 'select' operation \p I comes from a
- /// signed 'icmp' operation, false otherwise.
- bool isSigned(const SelectInst &I) const;
-
/// Return true if \p T is a legal scalar floating point type.
bool isLegalFloatingTy(const Type *T) const;
@@ -304,16 +296,6 @@ bool AMDGPUCodeGenPrepareImpl::run() {
return MadeChange;
}
-bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
- return I.getOpcode() == Instruction::AShr ||
- I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
-}
-
-bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
- return isa<ICmpInst>(I.getOperand(0)) &&
- cast<ICmpInst>(I.getOperand(0))->isSigned();
-}
-
bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
return Ty->isFloatTy() || Ty->isDoubleTy() ||
(Ty->isHalfTy() && ST.has16BitInsts());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index bb4bf74..55ce4f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -288,6 +288,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
// FIXME: Check MMO is atomic
def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
+def : GINodeEquiv<G_ATOMICRMW_USUB_COND, atomic_load_usub_cond_glue>;
+def : GINodeEquiv<G_ATOMICRMW_USUB_SAT, atomic_load_usub_sat_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>;
@@ -308,6 +310,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32, SIbuffer_atomic_cond_sub_u32>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32, SIbuffer_atomic_csub>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SBYTE, SIsbuffer_load_byte>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 5700468..85addb13a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -1968,7 +1968,7 @@ private:
int NumBits = 0;
auto TRI = TII->getRegisterInfo();
- auto &MRI = MI->getParent()->getParent()->getRegInfo();
+ auto &MRI = MI->getMF()->getRegInfo();
for (auto &Elt : Collection) {
auto Op = Elt->getInstr()->getOperand(0);
auto Size =
@@ -2183,7 +2183,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
// Interleave MFMA with DS_READ prefetch
- for (unsigned I = 0; I < DSRCount - 4; ++I) {
+ for (unsigned I = 4; I < DSRCount; ++I) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
@@ -2196,7 +2196,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
// Phase 2a: Loop carried dependency with V_PERM
// Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
// depend on. Interleave MFMA to keep XDL unit busy throughout.
- for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
+ for (unsigned I = DSWWithSharedVMEMCount; I < DSWWithPermCount; ++I) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
@@ -2233,7 +2233,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
// Phase 2b: Loop carried dependency without V_PERM
// Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
// Interleave MFMA to keep XDL unit busy throughout.
- for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
+ for (unsigned I = DSWWithPermCount; I < DSWCount; I++) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index aff7096..0688f07 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -11,7 +11,6 @@
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include <memory>
-#include <vector>
namespace llvm {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b8b419d..d0835a0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -134,7 +134,7 @@ static SDValue stripExtractLoElt(SDValue In) {
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false,
false)
-INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfoWrapperLegacy)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
#ifdef EXPENSIVE_CHECKS
@@ -238,7 +238,7 @@ bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
}
void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<AMDGPUArgumentUsageInfo>();
+ AU.addRequired<AMDGPUArgumentUsageInfoWrapperLegacy>();
AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
AU.addRequired<DominatorTreeWrapperPass>();
@@ -470,6 +470,24 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}
+SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N,
+ SelectionDAG &DAG) const {
+ // TODO: Handle undef as zero
+
+ assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
+ uint32_t LHSVal, RHSVal;
+ if (getConstantValue(N->getOperand(0), LHSVal) &&
+ getConstantValue(N->getOperand(1), RHSVal)) {
+ SDLoc SL(N);
+ uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
+ return DAG.getMachineNode(
+ isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL,
+ N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32));
+ }
+
+ return nullptr;
+}
+
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
@@ -708,10 +726,14 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
break;
}
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
assert(VT.getVectorElementType().bitsEq(MVT::i32));
- unsigned RegClassID =
- SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
- SelectBuildVector(N, RegClassID);
+ const TargetRegisterClass *RegClass =
+ N->isDivergent()
+ ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32)
+ : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32);
+
+ SelectBuildVector(N, RegClass->getID());
return;
}
case ISD::VECTOR_SHUFFLE:
@@ -1828,72 +1850,83 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
isFlatScratchBaseLegal(Addr))) {
int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
- const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
- Addr = N0;
- OffsetVal = COffsetVal;
- } else {
- // If the offset doesn't fit, put the low bits into the offset field and
- // add the rest.
- //
- // For a FLAT instruction the hardware decides whether to access
- // global/scratch/shared memory based on the high bits of vaddr,
- // ignoring the offset field, so we have to ensure that when we add
- // remainder to vaddr it still points into the same underlying object.
- // The easiest way to do that is to make sure that we split the offset
- // into two pieces that are both >= 0 or both <= 0.
-
- SDLoc DL(N);
- uint64_t RemainderOffset;
-
- std::tie(OffsetVal, RemainderOffset) =
- TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
-
- SDValue AddOffsetLo =
- getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
- SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
-
- if (Addr.getValueType().getSizeInBits() == 32) {
- SmallVector<SDValue, 3> Opnds;
- Opnds.push_back(N0);
- Opnds.push_back(AddOffsetLo);
- unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
- if (Subtarget->hasAddNoCarry()) {
- AddOp = AMDGPU::V_ADD_U32_e64;
- Opnds.push_back(Clamp);
- }
- Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+ // Adding the offset to the base address in a FLAT instruction must not
+ // change the memory aperture in which the address falls. Therefore we can
+ // only fold offsets from inbounds GEPs into FLAT instructions.
+ bool IsInBounds =
+ Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
+ if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
+ Addr = N0;
+ OffsetVal = COffsetVal;
} else {
- // TODO: Should this try to use a scalar add pseudo if the base address
- // is uniform and saddr is usable?
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
- SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
-
- SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, N0, Sub0);
- SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, N0, Sub1);
-
- SDValue AddOffsetHi =
- getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
-
- SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
-
- SDNode *Add =
- CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
- {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
-
- SDNode *Addc = CurDAG->getMachineNode(
- AMDGPU::V_ADDC_U32_e64, DL, VTs,
- {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
-
- SDValue RegSequenceArgs[] = {
- CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
- SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
-
- Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::i64, RegSequenceArgs),
- 0);
+ // If the offset doesn't fit, put the low bits into the offset field
+ // and add the rest.
+ //
+ // For a FLAT instruction the hardware decides whether to access
+ // global/scratch/shared memory based on the high bits of vaddr,
+ // ignoring the offset field, so we have to ensure that when we add
+ // remainder to vaddr it still points into the same underlying object.
+ // The easiest way to do that is to make sure that we split the offset
+ // into two pieces that are both >= 0 or both <= 0.
+
+ SDLoc DL(N);
+ uint64_t RemainderOffset;
+
+ std::tie(OffsetVal, RemainderOffset) =
+ TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
+
+ SDValue AddOffsetLo =
+ getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+ SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+
+ if (Addr.getValueType().getSizeInBits() == 32) {
+ SmallVector<SDValue, 3> Opnds;
+ Opnds.push_back(N0);
+ Opnds.push_back(AddOffsetLo);
+ unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
+ if (Subtarget->hasAddNoCarry()) {
+ AddOp = AMDGPU::V_ADD_U32_e64;
+ Opnds.push_back(Clamp);
+ }
+ Addr =
+ SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+ } else {
+ // TODO: Should this try to use a scalar add pseudo if the base
+ // address is uniform and saddr is usable?
+ SDValue Sub0 =
+ CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+ SDValue Sub1 =
+ CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+ SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, N0, Sub0);
+ SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, N0, Sub1);
+
+ SDValue AddOffsetHi =
+ getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+
+ SDNode *Add =
+ CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
+ {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+
+ SDNode *Addc = CurDAG->getMachineNode(
+ AMDGPU::V_ADDC_U32_e64, DL, VTs,
+ {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+
+ SDValue RegSequenceArgs[] = {
+ CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
+ MVT::i32),
+ SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
+
+ Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::i64, RegSequenceArgs),
+ 0);
+ }
}
}
}
@@ -3047,9 +3080,38 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
const unsigned Opc = gwsIntrinToOpcode(IntrID);
+
+ const MCInstrDesc &InstrDesc = TII->get(Opc);
+ int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
+
+ const TargetRegisterClass *DataRC = TII->getRegClass(InstrDesc, Data0Idx);
+
SmallVector<SDValue, 5> Ops;
- if (HasVSrc)
- Ops.push_back(N->getOperand(2));
+ if (HasVSrc) {
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+
+ SDValue Data = N->getOperand(2);
+ MVT DataVT = Data.getValueType().getSimpleVT();
+ if (TRI->isTypeLegalForClass(*DataRC, DataVT)) {
+ // Normal 32-bit case.
+ Ops.push_back(N->getOperand(2));
+ } else {
+ // Operand is really 32-bits, but requires 64-bit alignment, so use the
+ // even aligned 64-bit register class.
+ const SDValue RegSeqOps[] = {
+ CurDAG->getTargetConstant(DataRC->getID(), SL, MVT::i32), Data,
+ CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
+ SDValue(
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, MVT::i32),
+ 0),
+ CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32)};
+
+ Ops.push_back(SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
+ SL, MVT::v2i32, RegSeqOps),
+ 0));
+ }
+ }
+
Ops.push_back(OffsetField);
Ops.push_back(Chain);
@@ -4387,16 +4449,23 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
const auto *Ld = cast<LoadSDNode>(N);
-
const MachineMemOperand *MMO = Ld->getMemOperand();
- if (N->isDivergent() && !AMDGPU::isUniformMMO(MMO))
+
+ // FIXME: We ought to be able to take the direct isDivergent result. We
+ // cannot rely on the MMO for a uniformity check, and should stop using
+ // it. This is a hack for 2 ways that the IR divergence analysis is superior
+ // to the DAG divergence: Recognizing shift-of-workitem-id as always
+ // uniform, and isSingleLaneExecution. These should be handled in the DAG
+ // version, and then this can be dropped.
+ if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO))
return false;
return MMO->getSize().hasValue() &&
Ld->getAlign() >=
Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
uint64_t(4))) &&
- ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+ (MMO->isInvariant() ||
+ (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
(Subtarget->getScalarizeGlobalBehavior() &&
Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 4fa0d3f..a86b754 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELDAGTODAG_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELDAGTODAG_H
+#include "AMDGPUSelectionDAGInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIModeRegisterDefaults.h"
@@ -45,21 +46,6 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) {
return false;
}
-// TODO: Handle undef as zero
-static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
- assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
- uint32_t LHSVal, RHSVal;
- if (getConstantValue(N->getOperand(0), LHSVal) &&
- getConstantValue(N->getOperand(1), RHSVal)) {
- SDLoc SL(N);
- uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
- return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
- DAG.getTargetConstant(K, SL, MVT::i32));
- }
-
- return nullptr;
-}
-
/// AMDGPU specific code to select AMDGPU machine instructions for
/// SelectionDAG operations.
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
@@ -115,6 +101,8 @@ private:
MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
+ SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) const;
+
SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const;
SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
SDNode *glueCopyToM0LDSInit(SDNode *N) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 1b559a6..ff17833 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "AMDGPUMemoryUtils.h"
+#include "AMDGPUSelectionDAGInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
@@ -59,8 +60,9 @@ unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
}
AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
- const AMDGPUSubtarget &STI)
- : TargetLowering(TM), Subtarget(&STI) {
+ const TargetSubtargetInfo &STI,
+ const AMDGPUSubtarget &AMDGPUSTI)
+ : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
// Always lower memset, memcpy, and memmove intrinsics to load/store
// instructions, rather than generating calls to memset, memcpy or memmove.
MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
@@ -336,6 +338,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
+ setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand);
setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
@@ -502,9 +505,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// The hardware supports 32-bit FSHR, but not FSHL.
setOperationAction(ISD::FSHR, MVT::i32, Legal);
- // The hardware supports 32-bit ROTR, but not ROTL.
- setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
- setOperationAction(ISD::ROTR, MVT::i64, Expand);
+ setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
@@ -1216,7 +1217,7 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
const SmallVectorImpl<ISD::InputArg> &Ins) const {
const MachineFunction &MF = State.getMachineFunction();
const Function &Fn = MF.getFunction();
- LLVMContext &Ctx = Fn.getParent()->getContext();
+ LLVMContext &Ctx = Fn.getContext();
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
CallingConv::ID CC = Fn.getCallingConv();
@@ -1248,7 +1249,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
SmallVector<EVT, 16> ValueVTs;
SmallVector<uint64_t, 16> Offsets;
- ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
+ ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
+ &Offsets, ArgOffset);
for (unsigned Value = 0, NumValues = ValueVTs.size();
Value != NumValues; ++Value) {
@@ -1409,7 +1411,12 @@ SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
InVals.push_back(DAG.getPOISON(Arg.VT));
}
- return DAG.getEntryNode();
+ // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
+ if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
+ return CLI.Chain;
+
+ SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL);
+ return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL);
}
SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
@@ -1885,14 +1892,14 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
Align BaseAlign = Load->getAlign();
Align HiAlign = commonAlignment(BaseAlign, Size);
- SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
- Load->getChain(), BasePtr, SrcValue, LoMemVT,
- BaseAlign, Load->getMemOperand()->getFlags());
+ SDValue LoLoad = DAG.getExtLoad(
+ Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue,
+ LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo());
SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
- SDValue HiLoad =
- DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
- HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
- HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
+ SDValue HiLoad = DAG.getExtLoad(
+ Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr,
+ SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign,
+ Load->getMemOperand()->getFlags(), Load->getAAInfo());
SDValue Join;
if (LoVT == HiVT) {
@@ -1980,10 +1987,10 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
SDValue LoStore =
DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
- Store->getMemOperand()->getFlags());
- SDValue HiStore =
- DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
- HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
+ Store->getMemOperand()->getFlags(), Store->getAAInfo());
+ SDValue HiStore = DAG.getTruncStore(
+ Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign,
+ Store->getMemOperand()->getFlags(), Store->getAAInfo());
return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}
@@ -2764,7 +2771,6 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
EVT VT = Op.getValueType();
SDNodeFlags Flags = Op->getFlags();
SDLoc DL(Op);
-
const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
@@ -2803,7 +2809,9 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
-
+ // This adds correction terms for which contraction may lead to an increase
+ // in the error of the approximation, so disable it.
+ Flags.setAllowContract(false);
R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
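[Editorial note, not part of the patch] The sequence above is the standard exact-product compensation: with r = fl(y*c), the first FMA computes the product's rounding error exactly (barring underflow), fma(y, c, -r) = y*c - r, and r + (y*c - r) + y*cc reconstructs y*(c + cc) with extra effective precision. If the initial multiply were contracted into a later FMA, r would no longer be the rounded product and the correction term would cancel the wrong error, which is why the contract flag is cleared just before this block.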
@@ -2826,7 +2834,9 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
-
+ // This adds correction terms for which contraction may lead to an increase
+ // in the error of the approximation, so disable it.
+ Flags.setAllowContract(false);
SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
@@ -2950,19 +2960,28 @@ SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
}
+SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL,
+ SelectionDAG &DAG,
+ SDNodeFlags Flags,
+ bool IsExp10) const {
+ // exp(x) -> exp2(M_LOG2E_F * x);
+ // exp10(x) -> exp2(log2(10) * x);
+ EVT VT = X.getValueType();
+ SDValue Const =
+ DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT);
+
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags);
+ return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
+ : (unsigned)ISD::FEXP2,
+ SL, VT, Mul, Flags);
+}
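[Editorial note, not part of the patch] The rewrite relies on the identities e^x = 2^(x*log2(e)) and 10^x = 2^(x*log2(10)); numbers::log2e supplies log2(e) ~= 1.4426950, and 0x1.a934f0p+1f appears to be log2(10) ~= 3.3219281 rounded to single precision.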
+
SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
SelectionDAG &DAG,
SDNodeFlags Flags) const {
EVT VT = X.getValueType();
- const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
-
- if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
- // exp2(M_LOG2E_F * f);
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
- return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
- : (unsigned)ISD::FEXP2,
- SL, VT, Mul, Flags);
- }
+ if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags))
+ return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
@@ -2976,6 +2995,7 @@ SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
SDValue AdjustedX =
DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
+ const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
@@ -2994,6 +3014,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
SelectionDAG &DAG,
SDNodeFlags Flags) const {
const EVT VT = X.getValueType();
+
const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
: static_cast<unsigned>(ISD::FEXP2);
@@ -3050,33 +3071,32 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
SDNodeFlags Flags = Op->getFlags();
const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
- if (VT.getScalarType() == MVT::f16) {
- // v_exp_f16 (fmul x, log2e)
- if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
- return lowerFEXPUnsafe(X, SL, DAG, Flags);
+ // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
+ // library behavior. Also, is known-not-daz source sufficient?
+ if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
+ return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
+ : lowerFEXPUnsafe(X, SL, DAG, Flags);
+ }
+ if (VT.getScalarType() == MVT::f16) {
if (VT.isVector())
return SDValue();
+ // Nothing in half is a denormal when promoted to f32.
+ //
// exp(f16 x) ->
// fptrunc (v_exp_f32 (fmul (fpext x), log2e))
-
- // Nothing in half is a denormal when promoted to f32.
+ //
+ // exp10(f16 x) ->
+ // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
- SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
+ SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10);
return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
DAG.getTargetConstant(0, SL, MVT::i32), Flags);
}
assert(VT == MVT::f32);
- // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
- // library behavior. Also, is known-not-daz source sufficient?
- if (allowApproxFunc(DAG, Flags)) {
- return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
- : lowerFEXPUnsafe(X, SL, DAG, Flags);
- }
-
// Algorithm:
//
// e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
@@ -5649,169 +5669,6 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
}
-#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
-
-const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
- switch ((AMDGPUISD::NodeType)Opcode) {
- case AMDGPUISD::FIRST_NUMBER: break;
- // AMDIL DAG nodes
- NODE_NAME_CASE(BRANCH_COND);
-
- // AMDGPU DAG nodes
- NODE_NAME_CASE(IF)
- NODE_NAME_CASE(ELSE)
- NODE_NAME_CASE(LOOP)
- NODE_NAME_CASE(CALL)
- NODE_NAME_CASE(TC_RETURN)
- NODE_NAME_CASE(TC_RETURN_GFX)
- NODE_NAME_CASE(TC_RETURN_GFX_WholeWave)
- NODE_NAME_CASE(TC_RETURN_CHAIN)
- NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR)
- NODE_NAME_CASE(TRAP)
- NODE_NAME_CASE(RET_GLUE)
- NODE_NAME_CASE(WAVE_ADDRESS)
- NODE_NAME_CASE(RETURN_TO_EPILOG)
- NODE_NAME_CASE(ENDPGM)
- NODE_NAME_CASE(ENDPGM_TRAP)
- NODE_NAME_CASE(SIMULATED_TRAP)
- NODE_NAME_CASE(DWORDADDR)
- NODE_NAME_CASE(FRACT)
- NODE_NAME_CASE(SETCC)
- NODE_NAME_CASE(DENORM_MODE)
- NODE_NAME_CASE(FMA_W_CHAIN)
- NODE_NAME_CASE(FMUL_W_CHAIN)
- NODE_NAME_CASE(CLAMP)
- NODE_NAME_CASE(COS_HW)
- NODE_NAME_CASE(SIN_HW)
- NODE_NAME_CASE(FMAX_LEGACY)
- NODE_NAME_CASE(FMIN_LEGACY)
- NODE_NAME_CASE(FMAX3)
- NODE_NAME_CASE(SMAX3)
- NODE_NAME_CASE(UMAX3)
- NODE_NAME_CASE(FMIN3)
- NODE_NAME_CASE(SMIN3)
- NODE_NAME_CASE(UMIN3)
- NODE_NAME_CASE(FMED3)
- NODE_NAME_CASE(SMED3)
- NODE_NAME_CASE(UMED3)
- NODE_NAME_CASE(FMAXIMUM3)
- NODE_NAME_CASE(FMINIMUM3)
- NODE_NAME_CASE(FDOT2)
- NODE_NAME_CASE(URECIP)
- NODE_NAME_CASE(DIV_SCALE)
- NODE_NAME_CASE(DIV_FMAS)
- NODE_NAME_CASE(DIV_FIXUP)
- NODE_NAME_CASE(FMAD_FTZ)
- NODE_NAME_CASE(RCP)
- NODE_NAME_CASE(RSQ)
- NODE_NAME_CASE(RCP_LEGACY)
- NODE_NAME_CASE(RCP_IFLAG)
- NODE_NAME_CASE(LOG)
- NODE_NAME_CASE(EXP)
- NODE_NAME_CASE(FMUL_LEGACY)
- NODE_NAME_CASE(RSQ_CLAMP)
- NODE_NAME_CASE(FP_CLASS)
- NODE_NAME_CASE(DOT4)
- NODE_NAME_CASE(CARRY)
- NODE_NAME_CASE(BORROW)
- NODE_NAME_CASE(BFE_U32)
- NODE_NAME_CASE(BFE_I32)
- NODE_NAME_CASE(BFI)
- NODE_NAME_CASE(BFM)
- NODE_NAME_CASE(FFBH_U32)
- NODE_NAME_CASE(FFBH_I32)
- NODE_NAME_CASE(FFBL_B32)
- NODE_NAME_CASE(MUL_U24)
- NODE_NAME_CASE(MUL_I24)
- NODE_NAME_CASE(MULHI_U24)
- NODE_NAME_CASE(MULHI_I24)
- NODE_NAME_CASE(MAD_U24)
- NODE_NAME_CASE(MAD_I24)
- NODE_NAME_CASE(MAD_I64_I32)
- NODE_NAME_CASE(MAD_U64_U32)
- NODE_NAME_CASE(PERM)
- NODE_NAME_CASE(TEXTURE_FETCH)
- NODE_NAME_CASE(R600_EXPORT)
- NODE_NAME_CASE(CONST_ADDRESS)
- NODE_NAME_CASE(REGISTER_LOAD)
- NODE_NAME_CASE(REGISTER_STORE)
- NODE_NAME_CASE(CVT_F32_UBYTE0)
- NODE_NAME_CASE(CVT_F32_UBYTE1)
- NODE_NAME_CASE(CVT_F32_UBYTE2)
- NODE_NAME_CASE(CVT_F32_UBYTE3)
- NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
- NODE_NAME_CASE(CVT_PKNORM_I16_F32)
- NODE_NAME_CASE(CVT_PKNORM_U16_F32)
- NODE_NAME_CASE(CVT_PK_I16_I32)
- NODE_NAME_CASE(CVT_PK_U16_U32)
- NODE_NAME_CASE(FP_TO_FP16)
- NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
- NODE_NAME_CASE(CONST_DATA_PTR)
- NODE_NAME_CASE(PC_ADD_REL_OFFSET)
- NODE_NAME_CASE(PC_ADD_REL_OFFSET64)
- NODE_NAME_CASE(LDS)
- NODE_NAME_CASE(DUMMY_CHAIN)
- NODE_NAME_CASE(LOAD_D16_HI)
- NODE_NAME_CASE(LOAD_D16_LO)
- NODE_NAME_CASE(LOAD_D16_HI_I8)
- NODE_NAME_CASE(LOAD_D16_HI_U8)
- NODE_NAME_CASE(LOAD_D16_LO_I8)
- NODE_NAME_CASE(LOAD_D16_LO_U8)
- NODE_NAME_CASE(STORE_MSKOR)
- NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
- NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
- NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
- NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
- NODE_NAME_CASE(DS_ORDERED_COUNT)
- NODE_NAME_CASE(ATOMIC_CMP_SWAP)
- NODE_NAME_CASE(BUFFER_LOAD)
- NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
- NODE_NAME_CASE(BUFFER_LOAD_USHORT)
- NODE_NAME_CASE(BUFFER_LOAD_BYTE)
- NODE_NAME_CASE(BUFFER_LOAD_SHORT)
- NODE_NAME_CASE(BUFFER_LOAD_TFE)
- NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
- NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
- NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
- NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
- NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
- NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
- NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
- NODE_NAME_CASE(SBUFFER_LOAD)
- NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
- NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
- NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
- NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
- NODE_NAME_CASE(SBUFFER_PREFETCH_DATA)
- NODE_NAME_CASE(BUFFER_STORE)
- NODE_NAME_CASE(BUFFER_STORE_BYTE)
- NODE_NAME_CASE(BUFFER_STORE_SHORT)
- NODE_NAME_CASE(BUFFER_STORE_FORMAT)
- NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
- NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
- NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
- NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
- NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
- NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
- NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
- NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
- NODE_NAME_CASE(BUFFER_ATOMIC_AND)
- NODE_NAME_CASE(BUFFER_ATOMIC_OR)
- NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
- NODE_NAME_CASE(BUFFER_ATOMIC_INC)
- NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
- NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
- NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
- NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
- NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
- NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
- NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
- NODE_NAME_CASE(WHOLE_WAVE_SETUP)
- NODE_NAME_CASE(WHOLE_WAVE_RETURN)
- }
- return nullptr;
-}
-
SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
SelectionDAG &DAG, int Enabled,
int &RefinementSteps,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index bdaf486..10ae816 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -78,6 +78,9 @@ protected:
bool IsLog10, SDNodeFlags Flags) const;
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG,
+ SDNodeFlags Flags, bool IsExp10) const;
+
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG,
SDNodeFlags Flags) const;
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG,
@@ -180,7 +183,8 @@ protected:
const SmallVectorImpl<ISD::InputArg> &Ins) const;
public:
- AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
+ AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI,
+ const AMDGPUSubtarget &AMDGPUSTI);
bool mayIgnoreSignedZero(SDValue Op) const;
@@ -280,8 +284,6 @@ public:
SDValue RHS, SDValue True, SDValue False,
SDValue CC, DAGCombinerInfo &DCI) const;
- const char* getTargetNodeName(unsigned Opcode) const override;
-
// FIXME: Turn off MergeConsecutiveStores() before Instruction Selection for
// AMDGPU. Commit r319036,
// (https://github.com/llvm/llvm-project/commit/db77e57ea86d941a4262ef60261692f4cb6893e6)
@@ -406,235 +408,6 @@ public:
}
};
-namespace AMDGPUISD {
-
-enum NodeType : unsigned {
- // AMDIL ISD Opcodes
- FIRST_NUMBER = ISD::BUILTIN_OP_END,
- BRANCH_COND,
- // End AMDIL ISD Opcodes
-
- // Function call.
- CALL,
- TC_RETURN,
- TC_RETURN_GFX,
- TC_RETURN_GFX_WholeWave,
- TC_RETURN_CHAIN,
- TC_RETURN_CHAIN_DVGPR,
- TRAP,
-
- // Masked control flow nodes.
- IF,
- ELSE,
- LOOP,
-
- // A uniform kernel return that terminates the wavefront.
- ENDPGM,
-
- // s_endpgm, but we may want to insert it in the middle of the block.
- ENDPGM_TRAP,
-
- // "s_trap 2" equivalent on hardware that does not support it.
- SIMULATED_TRAP,
-
- // Return to a shader part's epilog code.
- RETURN_TO_EPILOG,
-
- // Return with values from a non-entry function.
- RET_GLUE,
-
- // Convert a unswizzled wave uniform stack address to an address compatible
- // with a vector offset for use in stack access.
- WAVE_ADDRESS,
-
- DWORDADDR,
- FRACT,
-
- /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output
- /// modifier behavior with dx10_enable.
- CLAMP,
-
- // This is SETCC with the full mask result which is used for a compare with a
- // result bit per item in the wavefront.
- SETCC,
-
- DENORM_MODE,
-
- // FP ops with input and output chain.
- FMA_W_CHAIN,
- FMUL_W_CHAIN,
-
- // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
- // Denormals handled on some parts.
- COS_HW,
- SIN_HW,
- FMAX_LEGACY,
- FMIN_LEGACY,
-
- FMAX3,
- SMAX3,
- UMAX3,
- FMIN3,
- SMIN3,
- UMIN3,
- FMED3,
- SMED3,
- UMED3,
- FMAXIMUM3,
- FMINIMUM3,
- FDOT2,
- URECIP,
- DIV_SCALE,
- DIV_FMAS,
- DIV_FIXUP,
- // For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is
- // treated as an illegal operation.
- FMAD_FTZ,
-
- // RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
- // For f64, max error 2^29 ULP, handles denormals.
- RCP,
- RSQ,
- RCP_LEGACY,
- RCP_IFLAG,
-
- // log2, no denormal handling for f32.
- LOG,
-
- // exp2, no denormal handling for f32.
- EXP,
-
- FMUL_LEGACY,
- RSQ_CLAMP,
- FP_CLASS,
- DOT4,
- CARRY,
- BORROW,
- BFE_U32, // Extract range of bits with zero extension to 32-bits.
- BFE_I32, // Extract range of bits with sign extension to 32-bits.
- BFI, // (src0 & src1) | (~src0 & src2)
- BFM, // Insert a range of bits into a 32-bit word.
- FFBH_U32, // ctlz with -1 if input is zero.
- FFBH_I32,
- FFBL_B32, // cttz with -1 if input is zero.
- MUL_U24,
- MUL_I24,
- MULHI_U24,
- MULHI_I24,
- MAD_U24,
- MAD_I24,
- MAD_U64_U32,
- MAD_I64_I32,
- PERM,
- TEXTURE_FETCH,
- R600_EXPORT,
- CONST_ADDRESS,
- REGISTER_LOAD,
- REGISTER_STORE,
-
- // These cvt_f32_ubyte* nodes need to remain consecutive and in order.
- CVT_F32_UBYTE0,
- CVT_F32_UBYTE1,
- CVT_F32_UBYTE2,
- CVT_F32_UBYTE3,
-
- // Convert two float 32 numbers into a single register holding two packed f16
- // with round to zero.
- CVT_PKRTZ_F16_F32,
- CVT_PKNORM_I16_F32,
- CVT_PKNORM_U16_F32,
- CVT_PK_I16_I32,
- CVT_PK_U16_U32,
-
- // Same as the standard node, except the high bits of the resulting integer
- // are known 0.
- FP_TO_FP16,
-
- /// This node is for VLIW targets and it is used to represent a vector
- /// that is stored in consecutive registers with the same channel.
- /// For example:
- /// |X |Y|Z|W|
- /// T0|v.x| | | |
- /// T1|v.y| | | |
- /// T2|v.z| | | |
- /// T3|v.w| | | |
- BUILD_VERTICAL_VECTOR,
- /// Pointer to the start of the shader's constant data.
- CONST_DATA_PTR,
- PC_ADD_REL_OFFSET,
- PC_ADD_REL_OFFSET64,
- LDS,
-
- DUMMY_CHAIN,
-
- FIRST_MEMORY_OPCODE,
- LOAD_D16_HI = FIRST_MEMORY_OPCODE,
- LOAD_D16_LO,
- LOAD_D16_HI_I8,
- LOAD_D16_HI_U8,
- LOAD_D16_LO_I8,
- LOAD_D16_LO_U8,
-
- STORE_MSKOR,
- TBUFFER_STORE_FORMAT,
- TBUFFER_STORE_FORMAT_D16,
- TBUFFER_LOAD_FORMAT,
- TBUFFER_LOAD_FORMAT_D16,
- DS_ORDERED_COUNT,
- ATOMIC_CMP_SWAP,
- BUFFER_LOAD,
- BUFFER_LOAD_UBYTE,
- BUFFER_LOAD_USHORT,
- BUFFER_LOAD_BYTE,
- BUFFER_LOAD_SHORT,
- BUFFER_LOAD_TFE,
- BUFFER_LOAD_UBYTE_TFE,
- BUFFER_LOAD_USHORT_TFE,
- BUFFER_LOAD_BYTE_TFE,
- BUFFER_LOAD_SHORT_TFE,
- BUFFER_LOAD_FORMAT,
- BUFFER_LOAD_FORMAT_TFE,
- BUFFER_LOAD_FORMAT_D16,
- SBUFFER_LOAD,
- SBUFFER_LOAD_BYTE,
- SBUFFER_LOAD_UBYTE,
- SBUFFER_LOAD_SHORT,
- SBUFFER_LOAD_USHORT,
- SBUFFER_PREFETCH_DATA,
- BUFFER_STORE,
- BUFFER_STORE_BYTE,
- BUFFER_STORE_SHORT,
- BUFFER_STORE_FORMAT,
- BUFFER_STORE_FORMAT_D16,
- BUFFER_ATOMIC_SWAP,
- BUFFER_ATOMIC_ADD,
- BUFFER_ATOMIC_SUB,
- BUFFER_ATOMIC_SMIN,
- BUFFER_ATOMIC_UMIN,
- BUFFER_ATOMIC_SMAX,
- BUFFER_ATOMIC_UMAX,
- BUFFER_ATOMIC_AND,
- BUFFER_ATOMIC_OR,
- BUFFER_ATOMIC_XOR,
- BUFFER_ATOMIC_INC,
- BUFFER_ATOMIC_DEC,
- BUFFER_ATOMIC_CMPSWAP,
- BUFFER_ATOMIC_CSUB,
- BUFFER_ATOMIC_FADD,
- BUFFER_ATOMIC_FMIN,
- BUFFER_ATOMIC_FMAX,
- BUFFER_ATOMIC_COND_SUB_U32,
- LAST_MEMORY_OPCODE = BUFFER_ATOMIC_COND_SUB_U32,
-
- // Set up a whole wave function.
- WHOLE_WAVE_SETUP,
-
- // Return from a whole wave function.
- WHOLE_WAVE_RETURN,
-};
-
-} // End namespace AMDGPUISD
-
} // End namespace llvm
#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index 9a90787..5f4ca82 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -221,7 +221,7 @@ public:
};
// A map from regunits to the delay info for that regunit.
- struct DelayState : DenseMap<unsigned, DelayInfo> {
+ struct DelayState : DenseMap<MCRegUnit, DelayInfo> {
// Merge another DelayState into this one by merging the delay info for each
// regunit.
void merge(const DelayState &RHS) {
@@ -359,7 +359,8 @@ public:
bool Changed = false;
MachineInstr *LastDelayAlu = nullptr;
- MCRegUnit LastSGPRFromVALU = 0;
+ // FIXME: 0 is a valid register unit.
+ MCRegUnit LastSGPRFromVALU = static_cast<MCRegUnit>(0);
// Iterate over the contents of bundles, but don't emit any instructions
// inside a bundle.
for (auto &MI : MBB.instrs()) {
@@ -379,7 +380,8 @@ public:
if (It != State.end()) {
DelayInfo Info = It->getSecond();
State.advanceByVALUNum(Info.VALUNum);
- LastSGPRFromVALU = 0;
+ // FIXME: 0 is a valid register unit.
+ LastSGPRFromVALU = static_cast<MCRegUnit>(0);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 4fe5d00..4792673 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -35,7 +35,7 @@ struct AMDGPUImageDMaskIntrinsic {
};
#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
-#include "InstCombineTables.inc"
+#include "AMDGPUGenSearchableTables.inc"
} // end anonymous namespace
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 5085e86..2b1f404 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -29,11 +29,19 @@ Intrinsic::ID AMDGPU::getIntrinsicID(const MachineInstr &I) {
// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
bool AMDGPU::isUniformMMO(const MachineMemOperand *MMO) {
const Value *Ptr = MMO->getValue();
+ if (!Ptr) {
+ if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) {
+ return PSV->isConstantPool() || PSV->isStack() || PSV->isGOT() ||
+ PSV->isJumpTable();
+ }
+
+ // Unknown value.
+ return false;
+ }
+
// UndefValue means this is a load of a kernel input. These are uniform.
// Sometimes LDS instructions have constant pointers.
- // If Ptr is null, then that means this mem operand contains a
- // PseudoSourceValue like GOT.
- if (!Ptr || isa<UndefValue, Constant, GlobalValue>(Ptr))
+ if (isa<UndefValue, Constant, GlobalValue>(Ptr))
return true;
if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index b8fa6f3..8a43c2d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -62,6 +62,7 @@ def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
// AMDGPU DAG Nodes
//
+// Masked control flow nodes.
def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>;
def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;
@@ -114,6 +115,7 @@ def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
[SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPOptInGlue]
>;
+// Pointer to the start of the shader's constant data.
def AMDGPUconstdata_ptr : SDNode<
"AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>,
SDTCisVT<0, iPTR>]>
@@ -122,18 +124,21 @@ def AMDGPUconstdata_ptr : SDNode<
// This argument to this node is a dword address.
def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
+// SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
+// Denormals handled on some parts.
def AMDGPUcos_impl : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>;
def AMDGPUsin_impl : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>;
+
// out = a - floor(a)
def AMDGPUfract_impl : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
// out = 1.0 / a
def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
-// v_log_f32, which is log2
+// v_log_f32, which is log2, no denormal handling for f32.
def AMDGPUlog_impl : SDNode<"AMDGPUISD::LOG", SDTFPUnaryOp>;
-// v_exp_f32, which is exp2
+// v_exp_f32, which is exp2, no denormal handling for f32.
def AMDGPUexp_impl : SDNode<"AMDGPUISD::EXP", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a)
@@ -146,11 +151,16 @@ def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a) result clamped to +/- max_float.
def AMDGPUrsq_clamp_impl : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
+// Convert two float 32 numbers into a single register holding two packed f16
+// with round to zero.
def AMDGPUpkrtz_f16_f32_impl : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
def AMDGPUpknorm_i16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
+
+// Same as the standard node, except the high bits of the resulting integer
+// are known 0.
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
@@ -225,14 +235,18 @@ def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc
SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
]>;
+// This is SETCC with the full mask result which is used for a compare with a
+// result bit per item in the wavefront.
def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;
+// FP ops with input and output chain.
def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [
SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [
SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+// These cvt_f32_ubyte* nodes need to remain consecutive and in order.
def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
SDTIntToFPOp, []>;
def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1",
@@ -264,6 +278,8 @@ def AMDGPUdiv_fmas_impl : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp,
// Denominator, src2 = Numerator).
def AMDGPUdiv_fixup_impl : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
+// For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is
+// treated as an illegal operation.
def AMDGPUfmad_ftz_impl : SDNode<"AMDGPUISD::FMAD_FTZ", SDTFPTernaryOp>;
def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
@@ -290,14 +306,23 @@ def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
SDNPMemOperand]>;
+// Extract range of bits with zero extension to 32-bits.
def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
+
+// Extract range of bits with sign extension to 32-bits.
def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
+
+// (src0 & src1) | (~src0 & src2)
def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
+
+// Insert a range of bits into a 32-bit word.
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
+// ctlz with -1 if input is zero.
def AMDGPUffbh_u32_impl : SDNode<"AMDGPUISD::FFBH_U32", SDTIntBitCountUnaryOp>;
def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>;
+// cttz with -1 if input is zero.
def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>;
// Signed and unsigned 24-bit multiply. The highest 8 bits are ignored.
@@ -394,16 +419,24 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai
//===----------------------------------------------------------------------===//
// Call/Return DAG Nodes
//===----------------------------------------------------------------------===//
+
+// A uniform kernel return that terminates the wavefront.
def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
+
+// s_endpgm, but we may want to insert it in the middle of the block.
def AMDGPUendpgm_trap : SDNode<"AMDGPUISD::ENDPGM_TRAP", SDTNone,
[SDNPHasChain]>;
+
+// "s_trap 2" equivalent on hardware that does not support it.
def AMDGPUsimulated_trap : SDNode<"AMDGPUISD::SIMULATED_TRAP", SDTNone,
[SDNPHasChain]>;
+// Return to a shader part's epilog code.
def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+// Return with values from a non-entry function.
def AMDGPUret_glue : SDNode<"AMDGPUISD::RET_GLUE", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 9ce1224..1549214 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -116,8 +116,14 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
if (!DstRC || DstRC != SrcRC)
return false;
- return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
- RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
+ if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
+ !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
+ return false;
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
+ MI.getOperand(0).setIsEarlyClobber(true);
+ }
+ return true;
}
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
@@ -221,12 +227,21 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock *BB = I.getParent();
+ Register VCCReg = I.getOperand(1).getReg();
+ MachineInstr *Cmp;
+
+ // Set SCC as a side effect with S_CMP or S_OR.
+ if (STI.hasScalarCompareEq64()) {
+ unsigned CmpOpc =
+ STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
+ Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
+ } else {
+ Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
+ .addReg(VCCReg)
+ .addReg(VCCReg);
+ }
- unsigned CmpOpc =
- STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
- MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
- .addReg(I.getOperand(1).getReg())
- .addImm(0);
if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
return false;
@@ -593,6 +608,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
I.setDesc(TII.get(Opc));
I.addOperand(*MF, MachineOperand::CreateImm(0));
I.addImplicitDefUseOperands(*MF);
+ I.getOperand(0).setIsEarlyClobber(true);
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
@@ -1930,20 +1946,52 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
// The resource id offset is computed as (<isa opaque base> + M0[21:16] +
// offset field) % 64. Some versions of the programming guide omit the m0
// part, or claim it's from offset 0.
- auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
+
+ unsigned Opc = gwsIntrinToOpcode(IID);
+ const MCInstrDesc &InstrDesc = TII.get(Opc);
if (HasVSrc) {
Register VSrc = MI.getOperand(1).getReg();
- MIB.addReg(VSrc);
- if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
- return false;
- }
+ int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
+ const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
+ const TargetRegisterClass *SubRC =
+ TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
- MIB.addImm(ImmOffset)
- .cloneMemRefs(MI);
+ if (!SubRC) {
+ // 32-bit normal case.
+ if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
+ return false;
- TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
+ BuildMI(*MBB, &MI, DL, InstrDesc)
+ .addReg(VSrc)
+ .addImm(ImmOffset)
+ .cloneMemRefs(MI);
+ } else {
+ // Requires even register alignment, so create 64-bit value and pad the
+ // top half with undef.
+ Register DataReg = MRI->createVirtualRegister(DataRC);
+ if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
+ return false;
+
+ Register UndefReg = MRI->createVirtualRegister(SubRC);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
+ .addReg(VSrc)
+ .addImm(AMDGPU::sub0)
+ .addReg(UndefReg)
+ .addImm(AMDGPU::sub1);
+
+ BuildMI(*MBB, &MI, DL, InstrDesc)
+ .addReg(DataReg)
+ .addImm(ImmOffset)
+ .cloneMemRefs(MI);
+ }
+ } else {
+ BuildMI(*MBB, &MI, DL, InstrDesc)
+ .addImm(ImmOffset)
+ .cloneMemRefs(MI);
+ }
MI.eraseFromParent();
return true;
@@ -1982,7 +2030,7 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
}
bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
- MachineFunction *MF = MI.getParent()->getParent();
+ MachineFunction *MF = MI.getMF();
SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
MFInfo->setInitWholeWave();
@@ -3674,7 +3722,7 @@ bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
MI.removeOperand(OpcodeOpIdx);
- MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
+ MI.addImplicitDefUseOperands(*MI.getMF());
return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}
@@ -3777,7 +3825,11 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
MI.removeOperand(4); // VDst_In
MI.removeOperand(1); // Intrinsic ID
MI.addOperand(VDst_In); // Readd VDst_In to the end
- MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
+ MI.addImplicitDefUseOperands(*MI.getMF());
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
+ MI.getOperand(0).setIsEarlyClobber(true);
+ }
return true;
}
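[Editorial sketch, not part of the patch] The early-clobber fixup added here and in constrainCopyLikeIntrin above follow the same pattern (selectG_AMDGPU_MAD_64_32 sets the flag unconditionally); a hypothetical helper, with a name invented for illustration, could factor it out:

// Propagate an EARLY_CLOBBER operand constraint from the MCInstrDesc onto
// the selected instruction's def operand.
static void setDef0EarlyClobberIfNeeded(MachineInstr &MI) {
  const MCInstrDesc &MCID = MI.getDesc();
  if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1)
    MI.getOperand(0).setIsEarlyClobber(true);
}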
@@ -4149,6 +4201,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UMAX:
case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
+ case TargetOpcode::G_ATOMICRMW_USUB_COND:
+ case TargetOpcode::G_ATOMICRMW_USUB_SAT:
case TargetOpcode::G_ATOMICRMW_FADD:
case TargetOpcode::G_ATOMICRMW_FMIN:
case TargetOpcode::G_ATOMICRMW_FMAX:
@@ -6744,7 +6798,7 @@ bool AMDGPUInstructionSelector::selectSGetBarrierState(
MachineInstr &I, Intrinsic::ID IntrID) const {
MachineBasicBlock *MBB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- MachineOperand BarOp = I.getOperand(2);
+ const MachineOperand &BarOp = I.getOperand(2);
std::optional<int64_t> BarValImm =
getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
@@ -6797,8 +6851,8 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit(
MachineInstr &I, Intrinsic::ID IntrID) const {
MachineBasicBlock *MBB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- MachineOperand BarOp = I.getOperand(1);
- MachineOperand CntOp = I.getOperand(2);
+ const MachineOperand &BarOp = I.getOperand(1);
+ const MachineOperand &CntOp = I.getOperand(2);
// BarID = (BarOp >> 4) & 0x3F
Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index bd443b5..2a99dac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -636,15 +636,11 @@ multiclass local_addr_space_atomic_op {
}
}
-defm int_amdgcn_global_atomic_csub : noret_op;
defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
defm int_amdgcn_flat_atomic_fmin_num : noret_op;
defm int_amdgcn_flat_atomic_fmax_num : noret_op;
defm int_amdgcn_global_atomic_fmin_num : noret_op;
defm int_amdgcn_global_atomic_fmax_num : noret_op;
-defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op;
-defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op;
-defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op;
multiclass noret_binary_atomic_op<SDNode atomic_op> {
let HasNoUse = true in
@@ -695,6 +691,8 @@ defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>;
defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>;
defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
+defm atomic_load_usub_cond : binary_atomic_op_all_as<atomic_load_usub_cond>;
+defm atomic_load_usub_sat : binary_atomic_op_all_as<atomic_load_usub_sat>;
defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;
def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
@@ -806,12 +804,6 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : AMDGPUPat <
(vt rc:$addr)
>;
-// rotr pattern
-class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
- (rotr i32:$src0, i32:$src1),
- (BIT_ALIGN $src0, $src0, $src1)
->;
-
// Special conversion patterns
def cvt_rpi_i32_f32 : PatFrag <
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 1a13b22..cb1a4ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/PseudoSourceValueManager.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -1721,6 +1722,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
+ auto &Atomics32 =
+ getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
+ .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
+ if (ST.hasFlatAddressSpace()) {
+ Atomics32.legalFor({{S32, FlatPtr}});
+ }
+
// TODO: v2bf16 operations, and fat buffer pointer support.
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
if (ST.hasLDSFPAtomicAddF32()) {
@@ -2321,14 +2329,14 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
return B.buildUnmerge(S32, Dst).getReg(1);
}
- // TODO: can we be smarter about machine pointer info?
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
Register LoadAddr = MRI.createGenericVirtualRegister(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
// For code object version 5, private_base and shared_base are passed through
// implicit kernargs.
if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
AMDGPU::AMDHSA_COV5) {
+ MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(B.getMF());
+
AMDGPUTargetLowering::ImplicitParameter Param =
AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
: AMDGPUTargetLowering::PRIVATE_BASE;
@@ -2343,7 +2351,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
return Register();
MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo,
+ PtrInfo.getWithOffset(Offset),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
LLT::scalar(32), commonAlignment(Align(64), Offset));
@@ -2361,6 +2369,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
return Register();
+ // TODO: Use custom PseudoSourceValue
+ MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+
// Offset into amd_queue_t for group_segment_aperture_base_hi /
// private_segment_aperture_base_hi.
uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
@@ -2560,8 +2571,14 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
uint32_t AddrHiVal = Info->get32BitAddressHighBits();
auto PtrLo = B.buildPtrToInt(S32, Src);
- auto HighAddr = B.buildConstant(S32, AddrHiVal);
- B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
+ if (AddrHiVal == 0) {
+ auto Zext = B.buildZExt(LLT::scalar(64), PtrLo);
+ B.buildIntToPtr(Dst, Zext);
+ } else {
+ auto HighAddr = B.buildConstant(S32, AddrHiVal);
+ B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
+ }
+
MI.eraseFromParent();
return true;
}
@@ -3551,12 +3568,14 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
-
- R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
- auto NegR = B.buildFNeg(Ty, R, Flags);
- auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
- auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
- R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
+ // This adds correction terms for which contraction may lead to an increase
+ // in the error of the approximation, so disable it.
+ auto NewFlags = Flags & ~(MachineInstr::FmContract);
+ R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
+ auto NegR = B.buildFNeg(Ty, R, NewFlags);
+ auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
+ auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
+ R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
} else {
// ch+ct is ln(2)/ln(10) to more than 36 bits
const float ch_log10 = 0x1.344000p-2f;
@@ -3572,12 +3591,15 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
auto MaskConst = B.buildConstant(Ty, 0xfffff000);
auto YH = B.buildAnd(Ty, Y, MaskConst);
auto YT = B.buildFSub(Ty, Y, YH, Flags);
- auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
+ // This adds correction terms for which contraction may lead to an increase
+ // in the error of the approximation, so disable it.
+ auto NewFlags = Flags & ~(MachineInstr::FmContract);
+ auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
Register Mad0 =
- getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
- Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
- R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
+ getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
+ Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags);
+ R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
}
const bool IsFiniteOnly =
@@ -3706,24 +3728,39 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
return true;
}
+static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst,
+ const SrcOp &Src, unsigned Flags) {
+ LLT Ty = Dst.getLLTTy(*B.getMRI());
+
+ if (Ty == LLT::scalar(32)) {
+ return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
+ .addUse(Src.getReg())
+ .setMIFlags(Flags);
+ }
+ return B.buildFExp2(Dst, Src, Flags);
+}
+
+bool AMDGPULegalizerInfo::legalizeFExpUnsafeImpl(MachineIRBuilder &B,
+ Register Dst, Register X,
+ unsigned Flags,
+ bool IsExp10) const {
+ LLT Ty = B.getMRI()->getType(X);
+
+ // exp(x) -> exp2(M_LOG2E_F * x);
+ // exp10(x) -> exp2(log2(10) * x);
+ auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
+ auto Mul = B.buildFMul(Ty, X, Const, Flags);
+ buildExp(B, Dst, Mul, Flags);
+ return true;
+}
+
bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
Register X, unsigned Flags) const {
LLT Ty = B.getMRI()->getType(Dst);
LLT F32 = LLT::scalar(32);
if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
- auto Log2E = B.buildFConstant(Ty, numbers::log2e);
- auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
-
- if (Ty == F32) {
- B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
- .addUse(Mul.getReg(0))
- .setMIFlags(Flags);
- } else {
- B.buildFExp2(Dst, Mul.getReg(0), Flags);
- }
-
- return true;
+ return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
}
auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
@@ -3746,6 +3783,55 @@ bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
return true;
}
+bool AMDGPULegalizerInfo::legalizeFExp10Unsafe(MachineIRBuilder &B,
+ Register Dst, Register X,
+ unsigned Flags) const {
+ LLT Ty = B.getMRI()->getType(Dst);
+ LLT F32 = LLT::scalar(32);
+
+ if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
+ // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
+ auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
+ auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
+
+ auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
+ auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
+ auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
+ auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
+ B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
+ return true;
+ }
+
+ // bool s = x < -0x1.2f7030p+5f;
+ // x += s ? 0x1.0p+5f : 0.0f;
+ // exp10 = exp2(x * 0x1.a92000p+1f) *
+ // exp2(x * 0x1.4f0978p-11f) *
+ // (s ? 0x1.9f623ep-107f : 1.0f);
+
+ auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
+ auto NeedsScaling =
+ B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold);
+
+ auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
+ auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
+ auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
+
+ auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
+ auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
+
+ auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
+ auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
+ auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
+ auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
+
+ auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
+ auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
+ auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
+
+ B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
+ return true;
+}
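[Editorial note, not part of the patch] The magic numbers look like a hi/lo (Cody-Waite style) split of log2(10): K0 = 0x1.a92000p+1f has a mantissa padded with trailing zeros so x*K0 rounds with little error, K1 = 0x1.4f0978p-11f carries the residual bits, and K0 + K1 ~= 3.3219281 ~= log2(10). Since exp2(x*K0) * exp2(x*K1) = exp2(x*(K0 + K1)), the product of the two exp2 calls evaluates 10^x while keeping the dominant multiplication accurate.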
+
bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
MachineIRBuilder &B) const {
Register Dst = MI.getOperand(0).getReg();
@@ -3762,18 +3848,22 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
// v_exp_f16 (fmul x, log2e)
if (allowApproxFunc(MF, Flags)) {
// TODO: Does this really require fast?
- legalizeFExpUnsafe(B, Dst, X, Flags);
+ IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
+ : legalizeFExpUnsafe(B, Dst, X, Flags);
MI.eraseFromParent();
return true;
}
+ // Nothing in half is a denormal when promoted to f32.
+ //
// exp(f16 x) ->
// fptrunc (v_exp_f32 (fmul (fpext x), log2e))
-
- // Nothing in half is a denormal when promoted to f32.
+ //
+ // exp10(f16 x) ->
+ // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
auto Ext = B.buildFPExt(F32, X, Flags);
Register Lowered = MRI.createGenericVirtualRegister(F32);
- legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
+ legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10);
B.buildFPTrunc(Dst, Lowered, Flags);
MI.eraseFromParent();
return true;
@@ -3784,7 +3874,8 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
// TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
// library behavior. Also, is known-not-daz source sufficient?
if (allowApproxFunc(MF, Flags)) {
- legalizeFExpUnsafe(B, Dst, X, Flags);
+ IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
+ : legalizeFExpUnsafe(B, Dst, X, Flags);
MI.eraseFromParent();
return true;
}
@@ -4709,6 +4800,14 @@ bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
return true;
}
+MachinePointerInfo
+AMDGPULegalizerInfo::getKernargSegmentPtrInfo(MachineFunction &MF) const {
+ // This isn't really a constant pool but close enough.
+ MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool());
+ PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS;
+ return PtrInfo;
+}
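[Editorial sketch, not part of the patch] A minimal usage sketch, mirroring the call sites updated in this patch (B, MF, and Offset as in getSegmentAperture above; no new API is assumed). Presumably the constant-pool pseudo source value also lets queries such as AMDGPU::isUniformMMO, extended earlier in this series, classify these loads as uniform.

// Build an invariant, dereferenceable MMO for a kernarg access at Offset.
MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(B.getMF());
MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo.getWithOffset(Offset),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
        MachineMemOperand::MOInvariant,
    LLT::scalar(32), commonAlignment(Align(64), Offset));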
+
Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
int64_t Offset) const {
LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
@@ -4736,8 +4835,8 @@ bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
"unexpected kernarg parameter type");
Register Ptr = getKernargParameterPtr(B, Offset);
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
- B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
+ MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(B.getMF());
+ B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4),
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
MI.eraseFromParent();
@@ -6538,8 +6637,15 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
default:
llvm_unreachable("unhandled atomic opcode");
@@ -7260,9 +7366,9 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
return false;
// TODO: can we be smarter about machine pointer info?
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+ MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF);
MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo,
+ PtrInfo.getWithOffset(Offset),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
LLT::scalar(64), commonAlignment(Align(64), Offset));
@@ -7724,7 +7830,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_make_buffer_rsrc:
return legalizePointerAsRsrcIntrin(MI, MRI, B);
case Intrinsic::amdgcn_kernarg_segment_ptr:
- if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
+ if (!AMDGPU::isKernel(B.getMF().getFunction())) {
// This only makes sense to call in a kernel, so just lower to null.
B.buildConstant(MI.getOperand(0).getReg(), 0);
MI.eraseFromParent();
@@ -7947,6 +8053,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index cd44a9b..1224ee7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -91,8 +91,12 @@ public:
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src,
bool IsLog10, unsigned Flags) const;
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const;
+ bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src,
+ unsigned Flags, bool IsExp10) const;
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src,
unsigned Flags) const;
+ bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src,
+ unsigned Flags) const;
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -132,6 +136,7 @@ public:
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
+ MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const;
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const;
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B,
uint64_t Offset,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index aa75534..821d7f38 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -845,7 +845,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
return false;
}
}
- LLVMContext &context = CI->getParent()->getParent()->getContext();
+ LLVMContext &context = CI->getContext();
Constant *nval;
if (getArgType(FInfo) == AMDGPULibFunc::F32) {
SmallVector<float, 0> FVal;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 0a59132..97e7a23 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1565,8 +1565,11 @@ void SplitPtrStructs::processConditionals() {
} else if (isa<SelectInst>(I)) {
if (MaybeRsrc) {
if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) {
- ConditionalTemps.push_back(RsrcInst);
- RsrcInst->replaceAllUsesWith(*MaybeRsrc);
+ // Guard against conditionals that were already folded away.
+ if (RsrcInst != *MaybeRsrc) {
+ ConditionalTemps.push_back(RsrcInst);
+ RsrcInst->replaceAllUsesWith(*MaybeRsrc);
+ }
}
for (Value *V : Seen)
FoundRsrcs[V] = *MaybeRsrc;
@@ -1745,6 +1748,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
case AtomicRMWInst::FMin:
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin;
break;
+ case AtomicRMWInst::USubCond:
+ IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32;
+ break;
+ case AtomicRMWInst::USubSat:
+ IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32;
+ break;
case AtomicRMWInst::FSub: {
reportFatalUsageError(
"atomic floating point subtraction not supported for "
@@ -1770,14 +1779,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
break;
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
- reportFatalUsageError("wrapping increment/decrement not supported for "
- "buffer resources and should've ben expanded away");
+ reportFatalUsageError(
+ "wrapping increment/decrement not supported for "
+ "buffer resources and should've been expanded away");
break;
case AtomicRMWInst::BAD_BINOP:
llvm_unreachable("Not sure how we got a bad binop");
- case AtomicRMWInst::USubCond:
- case AtomicRMWInst::USubSat:
- break;
}
}
@@ -2059,17 +2066,7 @@ PtrParts SplitPtrStructs::visitICmpInst(ICmpInst &Cmp) {
"Pointer comparison is only equal or unequal");
auto [LhsRsrc, LhsOff] = getPtrParts(Lhs);
auto [RhsRsrc, RhsOff] = getPtrParts(Rhs);
- Value *RsrcCmp =
- IRB.CreateICmp(Pred, LhsRsrc, RhsRsrc, Cmp.getName() + ".rsrc");
- copyMetadata(RsrcCmp, &Cmp);
- Value *OffCmp = IRB.CreateICmp(Pred, LhsOff, RhsOff, Cmp.getName() + ".off");
- copyMetadata(OffCmp, &Cmp);
-
- Value *Res = nullptr;
- if (Pred == ICmpInst::ICMP_EQ)
- Res = IRB.CreateAnd(RsrcCmp, OffCmp);
- else if (Pred == ICmpInst::ICMP_NE)
- Res = IRB.CreateOr(RsrcCmp, OffCmp);
+ Value *Res = IRB.CreateICmp(Pred, LhsOff, RhsOff);
copyMetadata(Res, &Cmp);
Res->takeName(&Cmp);
SplitUsers.insert(&Cmp);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
new file mode 100644
index 0000000..38b01dc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
@@ -0,0 +1,240 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers LDS global variables with the target extension type
+// "amdgpu.named.barrier", which require specialized address assignment. It
+// assigns a unique barrier identifier to each named-barrier LDS variable and
+// encodes that identifier in the !absolute_symbol metadata of the global.
+// This encoding ensures that subsequent LDS lowering passes can process these
+// barriers correctly without conflicts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUMemoryUtils.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+#include <algorithm>
+
+#define DEBUG_TYPE "amdgpu-lower-exec-sync"
+
+using namespace llvm;
+using namespace AMDGPU;
+
+namespace {
+
+// If GV is also used directly by other kernels, create a new GV
+// used only by this kernel and its function.
+static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
+ Function *KF) {
+ bool NeedsReplacement = false;
+ for (Use &U : GV->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ Function *F = I->getFunction();
+ if (isKernel(*F) && F != KF) {
+ NeedsReplacement = true;
+ break;
+ }
+ }
+ }
+ if (!NeedsReplacement)
+ return GV;
+ // Create a new GV used only by this kernel and its function
+ GlobalVariable *NewGV = new GlobalVariable(
+ M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
+ GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
+ GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
+ NewGV->copyAttributesFrom(GV);
+ for (Use &U : make_early_inc_range(GV->uses())) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ Function *F = I->getFunction();
+ if (!isKernel(*F) || F == KF) {
+ U.getUser()->replaceUsesOfWith(GV, NewGV);
+ }
+ }
+ }
+ return NewGV;
+}
+
+// Write the specified address into metadata where it can be retrieved by
+// the assembler. The format is a half-open range, [Address, Address+1).
+static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
+ uint32_t Address) {
+ LLVMContext &Ctx = M->getContext();
+ auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(Ctx, {MinC, MaxC}));
+}
+
+template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) {
+ sort(V, [](const auto *L, const auto *R) {
+ return L->getName() < R->getName();
+ });
+ return {std::move(V)};
+}
+
+// Main utility function for lowering special (named-barrier) LDS variables.
+static bool lowerExecSyncGlobalVariables(
+ Module &M, LDSUsesInfoTy &LDSUsesInfo,
+ VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
+ bool Changed = false;
+ const DataLayout &DL = M.getDataLayout();
+ // The 1st round: give module-absolute assignments
+ int NumAbsolutes = 0;
+ SmallVector<GlobalVariable *> OrderedGVs;
+ for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+ GlobalVariable *GV = K.first;
+ if (!isNamedBarrier(*GV))
+ continue;
+ // give a module-absolute assignment if it is indirectly accessed by
+ // multiple kernels. This is not precise, but we don't want to duplicate
+ // a function when it is called by multiple kernels.
+ if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
+ OrderedGVs.push_back(GV);
+ } else {
+ // leave it to the 2nd round, which will give a kernel-relative
+ // assignment if it is only indirectly accessed by one kernel
+ LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
+ }
+ LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
+ }
+ OrderedGVs = sortByName(std::move(OrderedGVs));
+ for (GlobalVariable *GV : OrderedGVs) {
+ unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned BarId = NumAbsolutes + 1;
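+ // Each named barrier occupies 16 bytes, so derive the barrier count from
+ // the variable's allocation size.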
+ unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+ NumAbsolutes += BarCnt;
+
+ // 4 bits for alignment, 5 bits for the barrier num,
+ // 3 bits for the barrier scope
+ unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+ recordLDSAbsoluteAddress(&M, GV, Offset);
+ }
+ OrderedGVs.clear();
+
+ // The 2nd round: give a kernel-relative assignment to each GV that is
+ // either only indirectly accessed by a single kernel or only directly
+ // accessed by multiple kernels.
+ SmallVector<Function *> OrderedKernels;
+ for (auto &K : LDSUsesInfo.direct_access) {
+ Function *F = K.first;
+ assert(isKernel(*F));
+ OrderedKernels.push_back(F);
+ }
+ OrderedKernels = sortByName(std::move(OrderedKernels));
+
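+ // Track how many kernel-relative barrier IDs have already been handed out
+ // in each kernel.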
+ DenseMap<Function *, uint32_t> Kernel2BarId;
+ for (Function *F : OrderedKernels) {
+ for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
+ if (!isNamedBarrier(*GV))
+ continue;
+
+ LDSUsesInfo.direct_access[F].erase(GV);
+ if (GV->isAbsoluteSymbolRef()) {
+ // already assigned
+ continue;
+ }
+ OrderedGVs.push_back(GV);
+ }
+ OrderedGVs = sortByName(std::move(OrderedGVs));
+ for (GlobalVariable *GV : OrderedGVs) {
+ // GV could also be used directly by other kernels. If so, we need to
+ // create a new GV used only by this kernel and its function.
+ auto NewGV = uniquifyGVPerKernel(M, GV, F);
+ Changed |= (NewGV != GV);
+ unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned BarId = Kernel2BarId[F];
+ BarId += NumAbsolutes + 1;
+ unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+ Kernel2BarId[F] += BarCnt;
+ unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+ recordLDSAbsoluteAddress(&M, NewGV, Offset);
+ }
+ OrderedGVs.clear();
+ }
+ // Also erase those special LDS variables from indirect_access.
+ for (auto &K : LDSUsesInfo.indirect_access) {
+ assert(isKernel(*K.first));
+ for (GlobalVariable *GV : K.second) {
+ if (isNamedBarrier(*GV))
+ K.second.erase(GV);
+ }
+ }
+ return Changed;
+}
+
+static bool runLowerExecSyncGlobals(Module &M) {
+ CallGraph CG = CallGraph(M);
+ bool Changed = false;
+ Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+
+ // For each kernel, what variables does it access directly or through
+ // callees
+ LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
+
+ // For each variable accessed through callees, which kernels access it
+ VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
+ for (auto &K : LDSUsesInfo.indirect_access) {
+ Function *F = K.first;
+ assert(isKernel(*F));
+ for (GlobalVariable *GV : K.second) {
+ LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+ }
+ }
+
+ if (LDSUsesInfo.HasSpecialGVs) {
+ // Special LDS variables need special address assignment
+ Changed |= lowerExecSyncGlobalVariables(
+ M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
+ }
+ return Changed;
+}
+
+class AMDGPULowerExecSyncLegacy : public ModulePass {
+public:
+ static char ID;
+ AMDGPULowerExecSyncLegacy() : ModulePass(ID) {}
+ bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPULowerExecSyncLegacy::ID = 0;
+char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID;
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
+ "AMDGPU lowering of execution synchronization", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
+ "AMDGPU lowering of execution synchronization", false,
+ false)
+
+bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) {
+ return runLowerExecSyncGlobals(M);
+}
+
+ModulePass *llvm::createAMDGPULowerExecSyncLegacyPass() {
+ return new AMDGPULowerExecSyncLegacy();
+}
+
+PreservedAnalyses AMDGPULowerExecSyncPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ return runLowerExecSyncGlobals(M) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index dec781d..755b44c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -64,7 +64,7 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
return false;
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- LLVMContext &Ctx = F.getParent()->getContext();
+ LLVMContext &Ctx = F.getContext();
const DataLayout &DL = F.getDataLayout();
BasicBlock &EntryBlock = *F.begin();
IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index fed7a13..248d7dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -98,7 +98,7 @@ static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
}
static bool processUse(CallInst *CI, bool IsV5OrAbove) {
- Function *F = CI->getParent()->getParent();
+ Function *F = CI->getFunction();
auto *MD = F->getMetadata("reqd_work_group_size");
const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index a4ef524..be30128 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -441,7 +441,7 @@ public:
return KernelSet;
for (Function &Func : M.functions()) {
- if (Func.isDeclaration() || !isKernelLDS(&Func))
+ if (Func.isDeclaration() || !isKernel(Func))
continue;
for (GlobalVariable *GV : LDSUsesInfo.indirect_access[&Func]) {
if (VariableSet.contains(GV)) {
@@ -555,7 +555,7 @@ public:
for (Function &Func : M->functions()) {
if (Func.isDeclaration())
continue;
- if (!isKernelLDS(&Func))
+ if (!isKernel(Func))
continue;
if (KernelsThatAllocateTableLDS.contains(&Func) ||
@@ -703,7 +703,7 @@ public:
return false;
}
Function *F = I->getFunction();
- return !isKernelLDS(F);
+ return !isKernel(*F);
});
// Replace uses of module scope variable from kernel functions that
@@ -711,7 +711,7 @@ public:
// Record on each kernel whether the module scope global is used by it
for (Function &Func : M.functions()) {
- if (Func.isDeclaration() || !isKernelLDS(&Func))
+ if (Func.isDeclaration() || !isKernel(Func))
continue;
if (KernelsThatAllocateModuleLDS.contains(&Func)) {
@@ -743,7 +743,7 @@ public:
DenseMap<Function *, LDSVariableReplacement> KernelToReplacement;
for (Function &Func : M.functions()) {
- if (Func.isDeclaration() || !isKernelLDS(&Func))
+ if (Func.isDeclaration() || !isKernel(Func))
continue;
DenseSet<GlobalVariable *> KernelUsedVariables;
@@ -828,7 +828,7 @@ public:
// semantics. Setting the alignment here allows this IR pass to accurately
// predict the exact constant at which it will be allocated.
- assert(isKernelLDS(func));
+ assert(isKernel(*func));
LLVMContext &Ctx = M.getContext();
const DataLayout &DL = M.getDataLayout();
@@ -878,7 +878,7 @@ public:
for (auto &func : OrderedKernels) {
if (KernelsThatIndirectlyAllocateDynamicLDS.contains(func)) {
- assert(isKernelLDS(func));
+ assert(isKernel(*func));
if (!func->hasName()) {
reportFatalUsageError("anonymous kernels cannot use LDS variables");
}
@@ -912,7 +912,7 @@ public:
auto *I = dyn_cast<Instruction>(U.getUser());
if (!I)
continue;
- if (isKernelLDS(I->getFunction()))
+ if (isKernel(*I->getFunction()))
continue;
replaceUseWithTableLookup(M, Builder, table, GV, U, nullptr);
@@ -922,126 +922,6 @@ public:
return KernelToCreatedDynamicLDS;
}
- static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
- Function *KF) {
- bool NeedsReplacement = false;
- for (Use &U : GV->uses()) {
- if (auto *I = dyn_cast<Instruction>(U.getUser())) {
- Function *F = I->getFunction();
- if (isKernelLDS(F) && F != KF) {
- NeedsReplacement = true;
- break;
- }
- }
- }
- if (!NeedsReplacement)
- return GV;
- // Create a new GV used only by this kernel and its function
- GlobalVariable *NewGV = new GlobalVariable(
- M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
- GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
- GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
- NewGV->copyAttributesFrom(GV);
- for (Use &U : make_early_inc_range(GV->uses())) {
- if (auto *I = dyn_cast<Instruction>(U.getUser())) {
- Function *F = I->getFunction();
- if (!isKernelLDS(F) || F == KF) {
- U.getUser()->replaceUsesOfWith(GV, NewGV);
- }
- }
- }
- return NewGV;
- }
-
- bool lowerSpecialLDSVariables(
- Module &M, LDSUsesInfoTy &LDSUsesInfo,
- VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
- bool Changed = false;
- const DataLayout &DL = M.getDataLayout();
- // The 1st round: give module-absolute assignments
- int NumAbsolutes = 0;
- std::vector<GlobalVariable *> OrderedGVs;
- for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
- GlobalVariable *GV = K.first;
- if (!isNamedBarrier(*GV))
- continue;
- // give a module-absolute assignment if it is indirectly accessed by
- // multiple kernels. This is not precise, but we don't want to duplicate
- // a function when it is called by multiple kernels.
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
- OrderedGVs.push_back(GV);
- } else {
- // leave it to the 2nd round, which will give a kernel-relative
- // assignment if it is only indirectly accessed by one kernel
- LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
- }
- LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
- }
- OrderedGVs = sortByName(std::move(OrderedGVs));
- for (GlobalVariable *GV : OrderedGVs) {
- unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
- unsigned BarId = NumAbsolutes + 1;
- unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
- NumAbsolutes += BarCnt;
-
- // 4 bits for alignment, 5 bits for the barrier num,
- // 3 bits for the barrier scope
- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, GV, Offset);
- }
- OrderedGVs.clear();
-
- // The 2nd round: give a kernel-relative assignment for GV that
- // either only indirectly accessed by single kernel or only directly
- // accessed by multiple kernels.
- std::vector<Function *> OrderedKernels;
- for (auto &K : LDSUsesInfo.direct_access) {
- Function *F = K.first;
- assert(isKernelLDS(F));
- OrderedKernels.push_back(F);
- }
- OrderedKernels = sortByName(std::move(OrderedKernels));
-
- llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
- for (Function *F : OrderedKernels) {
- for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
- if (!isNamedBarrier(*GV))
- continue;
-
- LDSUsesInfo.direct_access[F].erase(GV);
- if (GV->isAbsoluteSymbolRef()) {
- // already assigned
- continue;
- }
- OrderedGVs.push_back(GV);
- }
- OrderedGVs = sortByName(std::move(OrderedGVs));
- for (GlobalVariable *GV : OrderedGVs) {
- // GV could also be used directly by other kernels. If so, we need to
- // create a new GV used only by this kernel and its function.
- auto NewGV = uniquifyGVPerKernel(M, GV, F);
- Changed |= (NewGV != GV);
- unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
- unsigned BarId = Kernel2BarId[F];
- BarId += NumAbsolutes + 1;
- unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
- Kernel2BarId[F] += BarCnt;
- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, NewGV, Offset);
- }
- OrderedGVs.clear();
- }
- // Also erase those special LDS variables from indirect_access.
- for (auto &K : LDSUsesInfo.indirect_access) {
- assert(isKernelLDS(K.first));
- for (GlobalVariable *GV : K.second) {
- if (isNamedBarrier(*GV))
- K.second.erase(GV);
- }
- }
- return Changed;
- }
-
bool runOnModule(Module &M) {
CallGraph CG = CallGraph(M);
bool Changed = superAlignLDSGlobals(M);
@@ -1058,18 +938,12 @@ public:
VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
for (auto &K : LDSUsesInfo.indirect_access) {
Function *F = K.first;
- assert(isKernelLDS(F));
+ assert(isKernel(*F));
for (GlobalVariable *GV : K.second) {
LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
}
}
- if (LDSUsesInfo.HasSpecialGVs) {
- // Special LDS variables need special address assignment
- Changed |= lowerSpecialLDSVariables(
- M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
- }
-
// Partition variables accessed indirectly into the different strategies
DenseSet<GlobalVariable *> ModuleScopeVariables;
DenseSet<GlobalVariable *> TableLookupVariables;
@@ -1157,7 +1031,7 @@ public:
const DataLayout &DL = M.getDataLayout();
for (Function &Func : M.functions()) {
- if (Func.isDeclaration() || !isKernelLDS(&Func))
+ if (Func.isDeclaration() || !isKernel(Func))
continue;
// All three of these are optional. The first variable is allocated at
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
index 1e6589e..d7d0292 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
@@ -58,6 +58,8 @@ class AMDGPULowerVGPREncoding {
static constexpr unsigned BitsPerField = 2;
static constexpr unsigned NumFields = 4;
static constexpr unsigned FieldMask = (1 << BitsPerField) - 1;
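+ // Combined width of all mode fields, and a mask selecting those bits of the
+ // immediate.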
+ static constexpr unsigned ModeWidth = NumFields * BitsPerField;
+ static constexpr unsigned ModeMask = (1 << ModeWidth) - 1;
using ModeType = PackedVector<unsigned, BitsPerField,
std::bitset<BitsPerField * NumFields>>;
@@ -82,12 +84,12 @@ private:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
+ // Current basic block.
+ MachineBasicBlock *MBB;
+
/// Most recent s_set_* instruction.
MachineInstr *MostRecentModeSet;
- /// Whether the current mode is known.
- bool CurrentModeKnown;
-
/// Current mode bits.
ModeTy CurrentMode;
@@ -108,10 +110,13 @@ private:
MachineInstr *Clause;
/// Insert mode change before \p I. \returns true if mode was changed.
- bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I);
+ bool setMode(ModeTy NewMode, ModeTy Mask,
+ MachineBasicBlock::instr_iterator I);
/// Reset mode to default.
- void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); }
+ void resetMode(MachineBasicBlock::instr_iterator I) {
+ setMode(ModeTy(), ModeTy::fullMask(), I);
+ }
/// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
std::optional<unsigned> getMSBs(const MachineOperand &MO) const;
@@ -130,38 +135,43 @@ private:
/// Check if an instruction \p I is within a clause and returns a suitable
/// iterator to insert mode change. It may also modify the S_CLAUSE
/// instruction to extend it or drop the clause if it cannot be adjusted.
- MachineInstr *handleClause(MachineInstr *I);
+ MachineBasicBlock::instr_iterator
+ handleClause(MachineBasicBlock::instr_iterator I);
};
bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask,
- MachineInstr *I) {
+ MachineBasicBlock::instr_iterator I) {
assert((NewMode.raw_bits() & ~Mask.raw_bits()).none());
- if (CurrentModeKnown) {
- auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits();
+ auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits();
- if ((Delta & Mask.raw_bits()).none()) {
- CurrentMask |= Mask;
- return false;
- }
+ if ((Delta & Mask.raw_bits()).none()) {
+ CurrentMask |= Mask;
+ return false;
+ }
- if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) {
- CurrentMode |= NewMode;
- CurrentMask |= Mask;
+ if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) {
+ CurrentMode |= NewMode;
+ CurrentMask |= Mask;
- MostRecentModeSet->getOperand(0).setImm(CurrentMode);
- return true;
- }
+ MachineOperand &Op = MostRecentModeSet->getOperand(0);
+
+ // Carry old mode bits from the existing instruction.
+ int64_t OldModeBits = Op.getImm() & (ModeMask << ModeWidth);
+
+ Op.setImm(CurrentMode | OldModeBits);
+ return true;
}
+ // Record the previous mode in the high 8 bits of the immediate.
+ int64_t OldModeBits = CurrentMode << ModeWidth;
+
I = handleClause(I);
- MostRecentModeSet =
- BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
- .addImm(NewMode);
+ MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
+ .addImm(NewMode | OldModeBits);
CurrentMode = NewMode;
CurrentMask = Mask;
- CurrentModeKnown = true;
return true;
}
@@ -233,21 +243,22 @@ bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
if (Ops.first) {
ModeTy NewMode, Mask;
computeMode(NewMode, Mask, MI, Ops.first, Ops.second);
- return setMode(NewMode, Mask, &MI);
+ return setMode(NewMode, Mask, MI.getIterator());
}
assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());
return false;
}
-MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) {
+MachineBasicBlock::instr_iterator
+AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) {
if (!ClauseRemaining)
return I;
// A clause cannot start with a special instruction, place it right before
// the clause.
if (ClauseRemaining == ClauseLen) {
- I = Clause->getPrevNode();
+ I = Clause->getPrevNode()->getIterator();
assert(I->isBundle());
return I;
}
@@ -284,9 +295,9 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
ClauseLen = ClauseRemaining = 0;
CurrentMode.reset();
CurrentMask.reset();
- CurrentModeKnown = true;
for (auto &MBB : MF) {
MostRecentModeSet = nullptr;
+ this->MBB = &MBB;
for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) {
if (MI.isMetaInstruction())
@@ -294,17 +305,16 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
if (MI.isTerminator() || MI.isCall()) {
if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
- MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
+ MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED)
CurrentMode.reset();
- CurrentModeKnown = true;
- } else
- resetMode(&MI);
+ else
+ resetMode(MI.getIterator());
continue;
}
if (MI.isInlineAsm()) {
if (TII->hasVGPRUses(MI))
- resetMode(&MI);
+ resetMode(MI.getIterator());
continue;
}
@@ -323,14 +333,8 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
--ClauseRemaining;
}
- // If we're falling through to a block that has at least one other
- // predecessor, we no longer know the mode.
- MachineBasicBlock *Next = MBB.getNextNode();
- if (Next && Next->pred_size() >= 2 &&
- llvm::is_contained(Next->predecessors(), &MBB)) {
- if (CurrentMode.raw_bits().any())
- CurrentModeKnown = false;
- }
+ // Reset the mode if we are falling through.
+ resetMode(MBB.instr_end());
}
return Changed;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 680e7eb..bf9b429 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -229,7 +229,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.addOperand(Src);
return;
} else if (Opcode == AMDGPU::SI_TCRETURN ||
- Opcode == AMDGPU::SI_TCRETURN_GFX) {
+ Opcode == AMDGPU::SI_TCRETURN_GFX ||
+ Opcode == AMDGPU::SI_TCRETURN_CHAIN) {
// TODO: How to use branch immediate and avoid register+add?
Opcode = AMDGPU::S_SETPC_B64;
} else if (AMDGPU::getT16D16Helper(Opcode)) {
@@ -243,7 +244,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
int MCOpcode = TII->pseudoToMCOpcode(Opcode);
if (MCOpcode == -1) {
- LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext();
+ LLVMContext &C = MI->getMF()->getFunction().getContext();
C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have "
"a target-specific version: " + Twine(MI->getOpcode()));
}
@@ -332,7 +333,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
StringRef Err;
if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
- LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext();
+ LLVMContext &C = MI->getMF()->getFunction().getContext();
C.emitError("Illegal instruction detected: " + Err);
MI->print(errs());
}
@@ -412,7 +413,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
*OutStreamer);
if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
- unsigned V = MI->getOperand(0).getImm();
+ unsigned V = MI->getOperand(0).getImm() & 0xff;
OutStreamer->AddComment(
" msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) +
" src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index e17c211..8145816 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -126,7 +126,7 @@ void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
for (User *V : GV.users()) {
if (auto *I = dyn_cast<Instruction>(V)) {
Function *F = I->getFunction();
- if (isKernelLDS(F))
+ if (isKernel(*F))
kernels[F].insert(&GV);
else
Functions[F].insert(&GV);
@@ -135,10 +135,6 @@ void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
}
}
-bool isKernelLDS(const Function *F) {
- return AMDGPU::isKernel(F->getCallingConv());
-}
-
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
FunctionVariableMap DirectMapKernel;
@@ -148,7 +144,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
// Collect functions whose address has escaped
DenseSet<Function *> AddressTakenFuncs;
for (Function &F : M.functions()) {
- if (!isKernelLDS(&F))
+ if (!isKernel(F))
if (F.hasAddressTaken(nullptr,
/* IgnoreCallbackUses */ false,
/* IgnoreAssumeLikeCalls */ false,
@@ -180,7 +176,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
// access all variables accessed by functions whose address escaped
for (Function &F : M.functions()) {
if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) {
- if (!isKernelLDS(&F)) {
+ if (!isKernel(F)) {
set_union(TransitiveMapFunction[&F],
VariablesReachableThroughFunctionPointer);
}
@@ -190,7 +186,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
// Direct implementation of collecting all variables reachable from each
// function
for (Function &Func : M.functions()) {
- if (Func.isDeclaration() || isKernelLDS(&Func))
+ if (Func.isDeclaration() || isKernel(Func))
continue;
DenseSet<Function *> seen; // catches cycles
@@ -227,7 +223,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
FunctionVariableMap IndirectMapKernel;
for (Function &Func : M.functions()) {
- if (Func.isDeclaration() || !isKernelLDS(&Func))
+ if (Func.isDeclaration() || !isKernel(Func))
continue;
for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
@@ -273,6 +269,8 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
// this is a re-run of the pass
// so we don't have anything to do.
// - No variables are absolute.
+ // Named barriers that are already absolute symbols are removed
+ // from the maps.
std::optional<bool> HasAbsoluteGVs;
bool HasSpecialGVs = false;
for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
@@ -284,6 +282,10 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
if (IsDirectMapDynLDSGV)
continue;
if (isNamedBarrier(*GV)) {
+ if (IsAbsolute) {
+ DirectMapKernel[Fn].erase(GV);
+ IndirectMapKernel[Fn].erase(GV);
+ }
HasSpecialGVs = true;
continue;
}
@@ -335,7 +337,7 @@ void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
Function *PotentialCallee =
ExternalCallRecord.second->getFunction();
assert(PotentialCallee);
- if (!isKernelLDS(PotentialCallee)) {
+ if (!isKernel(*PotentialCallee)) {
for (StringRef Attr : FnAttrs)
PotentialCallee->removeFnAttr(Attr);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
index 058e744..8868b93 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
@@ -53,8 +53,6 @@ void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
FunctionVariableMap &kernels,
FunctionVariableMap &functions);
-bool isKernelLDS(const Function *F);
-
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M);
/// Strip FnAttr attribute from any functions where we may have
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index bf6f1a9..f464fbf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -13,6 +13,12 @@
// NOTE: NO INCLUDE GUARD DESIRED!
+#ifndef MODULE_ANALYSIS
+#define MODULE_ANALYSIS(NAME, CREATE_PASS)
+#endif
+MODULE_ANALYSIS("amdgpu-argument-usage", AMDGPUArgumentUsageAnalysis())
+#undef MODULE_ANALYSIS
+
#ifndef MODULE_PASS
#define MODULE_PASS(NAME, CREATE_PASS)
#endif
@@ -29,6 +35,7 @@ MODULE_PASS("amdgpu-perf-hint",
MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this))
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
+MODULE_PASS("amdgpu-lower-exec-sync", AMDGPULowerExecSyncPass())
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
#undef MODULE_PASS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
index cf2ab825..a3be0f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -48,7 +48,7 @@ private:
FuncInfoMap FIM;
public:
- AMDGPUPerfHintAnalysis() {}
+ AMDGPUPerfHintAnalysis() = default;
// OldPM
bool runOnSCC(const GCNTargetMachine &TM, CallGraphSCC &SCC);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 6e54737..4a70c5d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -177,8 +177,7 @@ void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16(
MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const {
Register Src = MatchInfo.Origin;
- assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
- LLT::scalar(64));
+ assert(MI.getMF()->getRegInfo().getType(Src) == LLT::scalar(64));
const LLT S32 = LLT::scalar(32);
auto Unmerge = B.buildUnmerge(S32, Src);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
index ffbbf63..7d6e3ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
@@ -127,7 +127,7 @@ private:
// will also be preloaded even if that data is unused.
Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
FunctionType *FT = F.getFunctionType();
- LLVMContext &Ctx = F.getParent()->getContext();
+ LLVMContext &Ctx = F.getContext();
SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
for (unsigned I = 0; I <= LastPreloadIndex; ++I)
FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
@@ -196,7 +196,7 @@ public:
SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
for (auto *U : ImplicitArgPtr->users()) {
Instruction *CI = dyn_cast<Instruction>(U);
- if (!CI || CI->getParent()->getParent() != &F)
+ if (!CI || CI->getFunction() != &F)
continue;
for (auto *U : CI->users()) {
@@ -213,7 +213,7 @@ public:
continue;
// FIXME: Expand handle merged loads.
- LLVMContext &Ctx = F.getParent()->getContext();
+ LLVMContext &Ctx = F.getContext();
Type *LoadTy = Load->getType();
HiddenArg HA = getHiddenArgFromOffset(Offset);
if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index f5e14c7..416de90 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -129,7 +129,7 @@ static StringRef getAsConstantStr(Value *V) {
static void diagnoseInvalidFormatString(const CallBase *CI) {
CI->getContext().diagnose(DiagnosticInfoUnsupported(
- *CI->getParent()->getParent(),
+ *CI->getFunction(),
"printf format string must be a trivially resolved constant string "
"global variable",
CI->getDebugLoc()));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index ddabd25..b79689c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -122,6 +122,7 @@ private:
/// Check whether we have enough local memory for promotion.
bool hasSufficientLocalMem(const Function &F);
+ FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
bool tryPromoteAllocaToVector(AllocaInst &I);
bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
@@ -460,13 +461,15 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
return nullptr;
Value *Offset = VarOffset.first;
- auto *OffsetType = dyn_cast<IntegerType>(Offset->getType());
- if (!OffsetType)
+ if (!isa<IntegerType>(Offset->getType()))
return nullptr;
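+ // Bring the variable offset to the index width (BW) before scaling it.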
+ Offset = Builder.CreateSExtOrTrunc(Offset, Builder.getIntNTy(BW));
+ if (Offset != VarOffset.first)
+ NewInsts.push_back(cast<Instruction>(Offset));
+
if (!OffsetQuot.isOne()) {
- ConstantInt *ConstMul =
- ConstantInt::get(Ctx, OffsetQuot.sext(OffsetType->getBitWidth()));
+ ConstantInt *ConstMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW));
Offset = Builder.CreateMul(Offset, ConstMul);
if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
NewInsts.push_back(NewInst);
@@ -474,8 +477,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
if (ConstOffset.isZero())
return Offset;
- ConstantInt *ConstIndex =
- ConstantInt::get(Ctx, IndexQuot.sext(OffsetType->getBitWidth()));
+ ConstantInt *ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));
Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex);
if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
NewInsts.push_back(NewInst);
@@ -501,27 +503,14 @@ static Value *promoteAllocaUserToVector(
Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
unsigned VecStoreSize, unsigned ElementSize,
DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
- std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal,
- SmallVectorImpl<LoadInst *> &DeferredLoads) {
+ std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx,
+ function_ref<Value *()> GetCurVal) {
// Note: we use InstSimplifyFolder because it can leverage the DataLayout
// to do more folding, especially in the case of vector splats.
IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
InstSimplifyFolder(DL));
Builder.SetInsertPoint(Inst);
- const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
- if (CurVal)
- return CurVal;
-
- // If the current value is not known, insert a dummy load and lower it on
- // the second pass.
- LoadInst *Dummy =
- Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()),
- "promotealloca.dummyload");
- DeferredLoads.push_back(Dummy);
- return Dummy;
- };
-
const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
Type *PtrTy) -> Value * {
assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
@@ -541,12 +530,7 @@ static Value *promoteAllocaUserToVector(
switch (Inst->getOpcode()) {
case Instruction::Load: {
- // Loads can only be lowered if the value is known.
- if (!CurVal) {
- DeferredLoads.push_back(cast<LoadInst>(Inst));
- return nullptr;
- }
-
+ Value *CurVal = GetCurVal();
Value *Index = calculateVectorIndex(
cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
@@ -636,7 +620,7 @@ static Value *promoteAllocaUserToVector(
Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
- Value *CurVec = GetOrLoadCurrentVectorValue();
+ Value *CurVec = GetCurVal();
for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
K < NumElts; ++K) {
Value *CurIdx =
@@ -649,8 +633,7 @@ static Value *promoteAllocaUserToVector(
if (Val->getType() != VecEltTy)
Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
- return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
- Index);
+ return Builder.CreateInsertElement(GetCurVal(), Val, Index);
}
case Instruction::Call: {
if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
@@ -672,7 +655,7 @@ static Value *promoteAllocaUserToVector(
}
}
- return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);
+ return Builder.CreateShuffleVector(GetCurVal(), Mask);
}
if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
@@ -791,16 +774,13 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
return I;
}
-// FIXME: Should try to pick the most likely to be profitable allocas first.
-bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
- LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
-
+FixedVectorType *
+AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
if (DisablePromoteAllocaToVector) {
- LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n");
- return false;
+ LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n");
+ return nullptr;
}
- Type *AllocaTy = Alloca.getAllocatedType();
auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
uint64_t NumElems = 1;
@@ -832,10 +812,9 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
}
}
}
-
if (!VectorTy) {
LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
- return false;
+ return nullptr;
}
const unsigned MaxElements =
@@ -845,9 +824,29 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
VectorTy->getNumElements() < 2) {
LLVM_DEBUG(dbgs() << " " << *VectorTy
<< " has an unsupported number of elements\n");
- return false;
+ return nullptr;
}
+ Type *VecEltTy = VectorTy->getElementType();
+ unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
+ if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
+ LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
+ "does not match the type's size\n");
+ return nullptr;
+ }
+
+ return VectorTy;
+}
+
+// FIXME: Should try to pick the most likely to be profitable allocas first.
+bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
+ LLVM_DEBUG(dbgs() << "Trying to promote to vectors: " << Alloca << '\n');
+
+ Type *AllocaTy = Alloca.getAllocatedType();
+ FixedVectorType *VectorTy = getVectorTypeForAlloca(AllocaTy);
+ if (!VectorTy)
+ return false;
+
std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx;
SmallVector<Instruction *> WorkList;
SmallVector<Instruction *> UsersToRemove;
@@ -869,13 +868,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");
Type *VecEltTy = VectorTy->getElementType();
- unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
- if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
- LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
- "does not match the type's size\n");
- return false;
- }
- unsigned ElementSize = ElementSizeInBits / 8;
+ unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
assert(ElementSize > 0);
for (auto *U : Uses) {
Instruction *Inst = cast<Instruction>(U->getUser());
@@ -1027,37 +1020,46 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
Updater.AddAvailableValue(EntryBB, AllocaInitValue);
- // First handle the initial worklist.
- SmallVector<LoadInst *, 4> DeferredLoads;
+ // First handle the initial worklist, in basic block order.
+ //
+ // Insert a placeholder whenever we need the vector value at the top of a
+ // basic block.
+ SmallVector<Instruction *> Placeholders;
forEachWorkListItem(WorkList, [&](Instruction *I) {
BasicBlock *BB = I->getParent();
- // On the first pass, we only take values that are trivially known, i.e.
- // where AddAvailableValue was already called in this block.
- Value *Result = promoteAllocaUserToVector(
- I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
- Updater.FindValueForBlock(BB), DeferredLoads);
+ auto GetCurVal = [&]() -> Value * {
+ if (Value *CurVal = Updater.FindValueForBlock(BB))
+ return CurVal;
+
+ if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
+ return Placeholders.back();
+
+ // If the current value in the basic block is not yet known, insert a
+ // placeholder that we will replace later.
+ IRBuilder<> Builder(I);
+ auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
+ PoisonValue::get(VectorTy), "promotealloca.placeholder"));
+ Placeholders.push_back(Placeholder);
+ return Placeholders.back();
+ };
+
+ Value *Result =
+ promoteAllocaUserToVector(I, *DL, VectorTy, VecStoreSize, ElementSize,
+ TransferInfo, GEPVectorIdx, GetCurVal);
if (Result)
Updater.AddAvailableValue(BB, Result);
});
- // Then handle deferred loads.
- forEachWorkListItem(DeferredLoads, [&](Instruction *I) {
- SmallVector<LoadInst *, 0> NewDLs;
- BasicBlock *BB = I->getParent();
- // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
- // get a value, inserting PHIs as needed.
- Value *Result = promoteAllocaUserToVector(
- I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
- Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs);
- if (Result)
- Updater.AddAvailableValue(BB, Result);
- assert(NewDLs.empty() && "No more deferred loads should be queued!");
- });
+ // Now fix up the placeholders.
+ for (Instruction *Placeholder : Placeholders) {
+ Placeholder->replaceAllUsesWith(
+ Updater.GetValueInMiddleOfBlock(Placeholder->getParent()));
+ Placeholder->eraseFromParent();
+ }
- // Delete all instructions. On the first pass, new dummy loads may have been
- // added so we need to collect them too.
+ // Delete all of the promoted instructions.
DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
- InstsToDelete.insert_range(DeferredLoads);
for (Instruction *I : InstsToDelete) {
assert(I->use_empty());
I->eraseFromParent();
@@ -1378,7 +1380,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
for (const User *U : Val->users()) {
if (const Instruction *Use = dyn_cast<Instruction>(U)) {
- if (Use->getParent()->getParent() == &F)
+ if (Use->getFunction() == &F)
return true;
} else {
const Constant *C = cast<Constant>(U);
@@ -1489,7 +1491,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
const DataLayout &DL = Mod->getDataLayout();
IRBuilder<> Builder(&I);
- const Function &ContainingFunction = *I.getParent()->getParent();
+ const Function &ContainingFunction = *I.getFunction();
CallingConv::ID CC = ContainingFunction.getCallingConv();
// Don't promote the alloca to LDS for shader calling conventions as the work
@@ -1544,7 +1546,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
- Function *F = I.getParent()->getParent();
+ Function *F = I.getFunction();
Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
GlobalVariable *GV = new GlobalVariable(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index e187959..839120d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
@@ -34,9 +35,17 @@
using namespace llvm;
using namespace AMDGPU;
+using namespace llvm::MIPatternMatch;
namespace {
+// AMDGPU-specific pattern matchers
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>
+m_GAMDGPUReadAnyLane(const SrcTy &Src) {
+ return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src);
+}
+
class AMDGPURegBankLegalize : public MachineFunctionPass {
public:
static char ID;
@@ -160,9 +169,25 @@ AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
// Src = G_AMDGPU_READANYLANE RALSrc
- auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
- if (RAL)
+ Register RALSrc;
+ if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc))))
+ return RALSrc;
+
+ // AExt = G_ANYEXT RALSrc
+ // TruncSrc = G_AMDGPU_READANYLANE AExt
+ // Src = G_TRUNC TruncSrc
+ if (mi_match(Src, MRI,
+ m_GTrunc(m_GAMDGPUReadAnyLane(m_GAnyExt(m_Reg(RALSrc)))))) {
return RALSrc;
+ }
+
+ // TruncSrc = G_AMDGPU_READANYLANE RALSrc
+ // AextSrc = G_TRUNC TruncSrc
+ // Src = G_ANYEXT AextSrc
+ if (mi_match(Src, MRI,
+ m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) {
+ return RALSrc;
+ }
// LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
// LoSgpr = G_AMDGPU_READANYLANE LoVgpr
@@ -410,7 +435,8 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
unsigned Opc = MI->getOpcode();
// Insert point for use operands needs some calculation.
if (Opc == AMDGPU::G_PHI) {
- RBLHelper.applyMappingPHI(*MI);
+ if (!RBLHelper.applyMappingPHI(*MI))
+ return false;
continue;
}
@@ -441,7 +467,8 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
// S1 rules are in RegBankLegalizeRules.
}
- RBLHelper.findRuleAndApplyMapping(*MI);
+ if (!RBLHelper.findRuleAndApplyMapping(*MI))
+ return false;
}
// Sgpr S1 clean up combines:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 5407566..cc31d7d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -32,28 +32,48 @@ using namespace AMDGPU;
RegBankLegalizeHelper::RegBankLegalizeHelper(
MachineIRBuilder &B, const MachineUniformityInfo &MUI,
const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
- : ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
- MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()),
+ : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
+ MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
+ RBLRules(RBLRules), IsWave32(ST.isWave32()),
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
-void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
- const SetOfRulesForOpcode &RuleSet = RBLRules.getRulesForOpc(MI);
- const RegBankLLTMapping &Mapping = RuleSet.findMappingForMI(MI, MRI, MUI);
+bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
+ const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
+ if (!RuleSet) {
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "No AMDGPU RegBankLegalize rules defined for opcode",
+ MI);
+ return false;
+ }
+
+ const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
+ if (!Mapping) {
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: none of the rules defined with "
+ "'Any' for MI's opcode matched MI",
+ MI);
+ return false;
+ }
SmallSet<Register, 4> WaterfallSgprs;
unsigned OpIdx = 0;
- if (Mapping.DstOpMapping.size() > 0) {
+ if (Mapping->DstOpMapping.size() > 0) {
B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
- applyMappingDst(MI, OpIdx, Mapping.DstOpMapping);
+ if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
+ return false;
}
- if (Mapping.SrcOpMapping.size() > 0) {
+ if (Mapping->SrcOpMapping.size() > 0) {
B.setInstr(MI);
- applyMappingSrc(MI, OpIdx, Mapping.SrcOpMapping, WaterfallSgprs);
+ if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WaterfallSgprs))
+ return false;
}
- lower(MI, Mapping, WaterfallSgprs);
+ if (!lower(MI, *Mapping, WaterfallSgprs))
+ return false;
+
+ return true;
}
bool RegBankLegalizeHelper::executeInWaterfallLoop(
@@ -274,7 +294,7 @@ bool RegBankLegalizeHelper::executeInWaterfallLoop(
return true;
}
-void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
+bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
MachineFunction &MF = B.getMF();
assert(MI.getNumMemOperands() == 1);
@@ -322,9 +342,10 @@ void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
B.buildMergeLikeInstr(Dst, MergeTyParts);
}
MI.eraseFromParent();
+ return true;
}
-void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
+bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
LLT MergeTy) {
MachineFunction &MF = B.getMF();
assert(MI.getNumMemOperands() == 1);
@@ -350,9 +371,10 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
B.buildMergeLikeInstr(Dst, MergeTyParts);
}
MI.eraseFromParent();
+ return true;
}
-void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
+bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
Register Dst = MI.getDstReg();
Register Ptr = MI.getPointerReg();
MachineMemOperand &MMO = MI.getMMO();
@@ -376,9 +398,10 @@ void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
}
MI.eraseFromParent();
+ return true;
}
-void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
+bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(Dst);
Register Src = MI.getOperand(1).getReg();
@@ -404,15 +427,22 @@ void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
Hi = B.buildUndef({VgprRB_S32});
break;
default:
- llvm_unreachable("Opcode not supported");
+ reportGISelFailure(
+ MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
+ return false;
}
B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
} else {
- llvm_unreachable("Type not supported");
+ reportGISelFailure(
+ MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
+ return false;
}
MI.eraseFromParent();
+ return true;
}
std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
@@ -437,7 +467,14 @@ std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
return {Lo.getReg(0), Hi.getReg(0)};
}
-void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
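+// Unpack both 16-bit halves of a packed value and truncate each back to an
+// SGPR S16 register.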
+std::pair<Register, Register>
+RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
+ auto [Lo32, Hi32] = unpackAExt(Reg);
+ return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
+ B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
+}
+
+bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
Register Lo, Hi;
switch (MI.getOpcode()) {
case AMDGPU::G_SHL: {
@@ -462,13 +499,18 @@ void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
break;
}
default:
- llvm_unreachable("Unpack lowering not implemented");
+ reportGISelFailure(
+ MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
+ MI);
+ return false;
}
B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
MI.eraseFromParent();
+ return true;
}
-void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
+bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
Register Lo, Hi;
switch (MI.getOpcode()) {
case AMDGPU::G_SMIN:
@@ -494,10 +536,25 @@ void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
break;
}
default:
- llvm_unreachable("Unpack min/max lowering not implemented");
+ reportGISelFailure(
+ MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
+ return false;
}
B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
MI.eraseFromParent();
+ return true;
+}
+
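+// Lower a packed 16-bit operation by any-extending the unpacked operands to
+// S32, performing the operation on the halves, and repacking the truncated
+// results.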
+bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
+ auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
+ auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
+ auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
+ auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
+ B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
+ {ResLo.getReg(0), ResHi.getReg(0)});
+ MI.eraseFromParent();
+ return true;
}
static bool isSignedBFE(MachineInstr &MI) {
@@ -507,7 +564,7 @@ static bool isSignedBFE(MachineInstr &MI) {
return MI.getOpcode() == AMDGPU::G_SBFX;
}
-void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
+bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
assert(MRI.getType(Dst) == LLT::scalar(64));
bool Signed = isSignedBFE(MI);
@@ -534,7 +591,7 @@ void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
MI.eraseFromParent();
- return;
+ return true;
}
uint64_t WidthImm = ConstWidth->Value.getZExtValue();
@@ -564,9 +621,10 @@ void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
}
MI.eraseFromParent();
+ return true;
}
-void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
+bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
Register DstReg = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(DstReg);
bool Signed = isSignedBFE(MI);
@@ -592,14 +650,19 @@ void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
{B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
- *ST.getRegisterInfo(), RBI))
- llvm_unreachable("failed to constrain BFE");
+ *ST.getRegisterInfo(), RBI)) {
+ reportGISelFailure(
+ MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: lowerS_BFE, failed to constrain BFE", MI);
+ return false;
+ }
B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
MI.eraseFromParent();
+ return true;
}
-void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
+bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst);
assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
@@ -614,9 +677,35 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
B.buildMergeLikeInstr(Dst, {Lo, Hi});
MI.eraseFromParent();
+ return true;
+}
+
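+// Scalarize an SGPR V2S16 operation: unpack the operands into S16 halves,
+// apply the opcode to each half, and rebuild the V2S16 result.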
+bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
+ Register Dst = MI.getOperand(0).getReg();
+ assert(MRI.getType(Dst) == V2S16);
+ unsigned Opc = MI.getOpcode();
+ auto Flags = MI.getFlags();
+
+ if (MI.getNumOperands() == 2) {
+ auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
+ auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
+ auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
+ B.buildMergeLikeInstr(Dst, {Lo, Hi});
+ MI.eraseFromParent();
+ return true;
+ }
+
+ assert(MI.getNumOperands() == 3);
+ auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
+ auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
+ auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
+ auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
+ B.buildMergeLikeInstr(Dst, {Lo, Hi});
+ MI.eraseFromParent();
+ return true;
}
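(Aside: this is the ScalarizeToS16 path used by the rules further below for packed f16 math on subtargets with SALU float instructions; a uniform <2 x s16> op is split into two s16 SALU ops and the halves are re-merged. Sketch only, register names made up:)

  %dst:sgpr(<2 x s16>) = G_FADD %x, %y
  -->
  %lo:sgpr(s16) = G_FADD %xlo, %ylo
  %hi:sgpr(s16) = G_FADD %xhi, %yhi
  %dst:sgpr(<2 x s16>) = G_BUILD_VECTOR %lo, %hi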
-void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
+bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst);
assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
@@ -633,9 +722,10 @@ void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
B.buildMergeLikeInstr(Dst, {Lo, Hi});
MI.eraseFromParent();
+ return true;
}
-void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
+bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
int Amt = MI.getOperand(2).getImm();
Register Lo, Hi;
@@ -660,9 +750,10 @@ void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
MI.eraseFromParent();
+ return true;
}
-void RegBankLegalizeHelper::lower(MachineInstr &MI,
+bool RegBankLegalizeHelper::lower(MachineInstr &MI,
const RegBankLLTMapping &Mapping,
SmallSet<Register, 4> &WaterfallSgprs) {
@@ -682,12 +773,14 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
False);
MI.eraseFromParent();
- return;
+ return true;
}
case UnpackBitShift:
return lowerUnpackBitShift(MI);
case UnpackMinMax:
return lowerUnpackMinMax(MI);
+ case ScalarizeToS16:
+ return lowerSplitTo16(MI);
case Ext32To64: {
const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
MachineInstrBuilder Hi;
@@ -707,20 +800,23 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
break;
}
default:
- llvm_unreachable("Unsuported Opcode in Ext32To64");
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
+ MI);
+ return false;
}
B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
{MI.getOperand(1).getReg(), Hi});
MI.eraseFromParent();
- return;
+ return true;
}
case UniCstExt: {
uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
MI.eraseFromParent();
- return;
+ return true;
}
case VgprToVccCopy: {
Register Src = MI.getOperand(1).getReg();
@@ -744,7 +840,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
auto Zero = B.buildConstant({VgprRB, Ty}, 0);
B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
MI.eraseFromParent();
- return;
+ return true;
}
case V_BFE:
return lowerV_BFE(MI);
@@ -773,8 +869,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
else if (Size / 128 == 4)
splitLoad(MI, {B128, B128, B128, B128});
else {
- LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
- llvm_unreachable("SplitLoad type not supported for MI");
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
+ MI);
+ return false;
}
}
// 64 and 32 bit load
@@ -785,10 +883,12 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
else if (DstTy == V6S16)
splitLoad(MI, {V4S16, V2S16}, V2S16);
else {
- LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
- llvm_unreachable("SplitLoad type not supported for MI");
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
+ MI);
+ return false;
}
- break;
+ return true;
}
case WidenLoad: {
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
@@ -799,19 +899,25 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
else if (DstTy == V6S16)
widenLoad(MI, V8S16, V2S16);
else {
- LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
- llvm_unreachable("WidenLoad type not supported for MI");
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
+ MI);
+ return false;
}
- break;
+ return true;
}
+ case UnpackAExt:
+ return lowerUnpackAExt(MI);
case WidenMMOToS32:
return widenMMOToS32(cast<GAnyLoad>(MI));
}
if (!WaterfallSgprs.empty()) {
MachineBasicBlock::iterator I = MI.getIterator();
- executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs);
+ if (!executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs))
+ return false;
}
+ return true;
}
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
@@ -837,10 +943,12 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
return LLT::scalar(32);
case Sgpr64:
case Vgpr64:
+ case UniInVgprS64:
return LLT::scalar(64);
case Sgpr128:
case Vgpr128:
return LLT::scalar(128);
+ case SgprP0:
case VgprP0:
return LLT::pointer(0, 64);
case SgprP1:
@@ -855,6 +963,8 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case SgprP5:
case VgprP5:
return LLT::pointer(5, 32);
+ case SgprP8:
+ return LLT::pointer(8, 128);
case SgprV2S16:
case VgprV2S16:
case UniInVgprV2S16:
@@ -940,10 +1050,12 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case Sgpr32_WF:
case Sgpr64:
case Sgpr128:
+ case SgprP0:
case SgprP1:
case SgprP3:
case SgprP4:
case SgprP5:
+ case SgprP8:
case SgprPtr32:
case SgprPtr64:
case SgprPtr128:
@@ -960,6 +1072,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case UniInVcc:
case UniInVgprS16:
case UniInVgprS32:
+ case UniInVgprS64:
case UniInVgprV2S16:
case UniInVgprV4S32:
case UniInVgprB32:
@@ -1003,7 +1116,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
}
}
-void RegBankLegalizeHelper::applyMappingDst(
+bool RegBankLegalizeHelper::applyMappingDst(
MachineInstr &MI, unsigned &OpIdx,
const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
// Defs start from operand 0
@@ -1022,10 +1135,12 @@ void RegBankLegalizeHelper::applyMappingDst(
case Sgpr32:
case Sgpr64:
case Sgpr128:
+ case SgprP0:
case SgprP1:
case SgprP3:
case SgprP4:
case SgprP5:
+ case SgprP8:
case SgprV2S16:
case SgprV2S32:
case SgprV4S32:
@@ -1092,6 +1207,7 @@ void RegBankLegalizeHelper::applyMappingDst(
break;
}
case UniInVgprS32:
+ case UniInVgprS64:
case UniInVgprV2S16:
case UniInVgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
@@ -1120,20 +1236,28 @@ void RegBankLegalizeHelper::applyMappingDst(
assert(RB == SgprRB);
Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
Op.setReg(NewDst);
- B.buildTrunc(Reg, NewDst);
+ if (!MRI.use_empty(Reg))
+ B.buildTrunc(Reg, NewDst);
break;
}
case InvalidMapping: {
- LLVM_DEBUG(dbgs() << "Instruction with Invalid mapping: "; MI.dump(););
- llvm_unreachable("missing fast rule for MI");
+ reportGISelFailure(
+ MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
+ return false;
}
default:
- llvm_unreachable("ID not supported");
+ reportGISelFailure(
+ MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
+ return false;
}
}
+
+ return true;
}
-void RegBankLegalizeHelper::applyMappingSrc(
+bool RegBankLegalizeHelper::applyMappingSrc(
MachineInstr &MI, unsigned &OpIdx,
const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
SmallSet<Register, 4> &SgprWaterfallOperandRegs) {
@@ -1163,10 +1287,12 @@ void RegBankLegalizeHelper::applyMappingSrc(
case Sgpr32:
case Sgpr64:
case Sgpr128:
+ case SgprP0:
case SgprP1:
case SgprP3:
case SgprP4:
case SgprP5:
+ case SgprP8:
case SgprV2S16:
case SgprV2S32:
case SgprV4S32: {
@@ -1285,12 +1411,16 @@ void RegBankLegalizeHelper::applyMappingSrc(
break;
}
default:
- llvm_unreachable("ID not supported");
+ reportGISelFailure(
+ MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
+ return false;
}
}
+ return true;
}
-void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
+bool RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(Dst);
@@ -1313,16 +1443,17 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
MI.getOperand(i).setReg(NewUse.getReg(0));
}
- return;
+ return true;
}
- // ALL divergent i1 phis should be already lowered and inst-selected into PHI
- // with sgpr reg class and S1 LLT.
+ // ALL divergent i1 phis should have been lowered and inst-selected into PHI
+ // with sgpr reg class and S1 LLT in the AMDGPUGlobalISelDivergenceLowering pass.
// Note: this includes divergent phis that don't require lowering.
if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) {
- LLVM_DEBUG(dbgs() << "Divergent S1 G_PHI: "; MI.dump(););
- llvm_unreachable("Make sure to run AMDGPUGlobalISelDivergenceLowering "
- "before RegBankLegalize to lower lane mask(vcc) phis");
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: Can't lower divergent S1 G_PHI",
+ MI);
+ return false;
}
// We accept all types that can fit in some register class.
@@ -1330,11 +1461,13 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
// Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
Ty == LLT::pointer(4, 64)) {
- return;
+ return true;
}
- LLVM_DEBUG(dbgs() << "G_PHI not handled: "; MI.dump(););
- llvm_unreachable("type not supported");
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: type not supported for G_PHI",
+ MI);
+ return false;
}
[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index d937815..1dc0278 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -12,6 +12,7 @@
#include "AMDGPURegBankLegalizeRules.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
namespace llvm {
@@ -27,11 +28,13 @@ namespace AMDGPU {
// to replace instruction. In other case InstApplyMethod will create new
// instruction(s).
class RegBankLegalizeHelper {
+ MachineFunction &MF;
const GCNSubtarget &ST;
MachineIRBuilder &B;
MachineRegisterInfo &MRI;
const MachineUniformityInfo &MUI;
const RegisterBankInfo &RBI;
+ MachineOptimizationRemarkEmitter MORE;
const RegBankLegalizeRules &RBLRules;
const bool IsWave32;
const RegisterBank *SgprRB;
@@ -72,6 +75,7 @@ class RegBankLegalizeHelper {
static constexpr LLT P6 = LLT::pointer(6, 32);
MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32};
+ MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16};
MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32};
MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1};
@@ -80,10 +84,10 @@ public:
const RegisterBankInfo &RBI,
const RegBankLegalizeRules &RBLRules);
- void findRuleAndApplyMapping(MachineInstr &MI);
+ bool findRuleAndApplyMapping(MachineInstr &MI);
// Manual apply helpers.
- void applyMappingPHI(MachineInstr &MI);
+ bool applyMappingPHI(MachineInstr &MI);
void applyMappingTrivial(MachineInstr &MI);
private:
@@ -96,34 +100,37 @@ private:
const RegisterBank *getRegBankFromID(RegBankLLTMappingApplyID ID);
- void
+ bool
applyMappingDst(MachineInstr &MI, unsigned &OpIdx,
const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs);
- void
+ bool
applyMappingSrc(MachineInstr &MI, unsigned &OpIdx,
const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
SmallSet<Register, 4> &SgprWaterfallOperandRegs);
- void splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown,
+ bool splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown,
LLT MergeTy = LLT());
- void widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy = LLT());
- void widenMMOToS32(GAnyLoad &MI) const;
+ bool widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy = LLT());
+ bool widenMMOToS32(GAnyLoad &MI) const;
- void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping,
+ bool lower(MachineInstr &MI, const RegBankLLTMapping &Mapping,
SmallSet<Register, 4> &SgprWaterfallOperandRegs);
- void lowerVccExtToSel(MachineInstr &MI);
+ bool lowerVccExtToSel(MachineInstr &MI);
std::pair<Register, Register> unpackZExt(Register Reg);
std::pair<Register, Register> unpackSExt(Register Reg);
std::pair<Register, Register> unpackAExt(Register Reg);
- void lowerUnpackBitShift(MachineInstr &MI);
- void lowerV_BFE(MachineInstr &MI);
- void lowerS_BFE(MachineInstr &MI);
- void lowerSplitTo32(MachineInstr &MI);
- void lowerSplitTo32Select(MachineInstr &MI);
- void lowerSplitTo32SExtInReg(MachineInstr &MI);
- void lowerUnpackMinMax(MachineInstr &MI);
+ std::pair<Register, Register> unpackAExtTruncS16(Register Reg);
+ bool lowerUnpackBitShift(MachineInstr &MI);
+ bool lowerV_BFE(MachineInstr &MI);
+ bool lowerS_BFE(MachineInstr &MI);
+ bool lowerSplitTo32(MachineInstr &MI);
+ bool lowerSplitTo16(MachineInstr &MI);
+ bool lowerSplitTo32Select(MachineInstr &MI);
+ bool lowerSplitTo32SExtInReg(MachineInstr &MI);
+ bool lowerUnpackMinMax(MachineInstr &MI);
+ bool lowerUnpackAExt(MachineInstr &MI);
};
} // end namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index a67b12a..9de3092 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -66,6 +66,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::pointer(4, 64);
case P5:
return MRI.getType(Reg) == LLT::pointer(5, 32);
+ case P8:
+ return MRI.getType(Reg) == LLT::pointer(8, 128);
case Ptr32:
return isAnyPtr(MRI.getType(Reg), 32);
case Ptr64:
@@ -108,6 +110,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg);
case UniP5:
return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg);
+ case UniP8:
+ return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniform(Reg);
case UniPtr32:
return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniform(Reg);
case UniPtr64:
@@ -202,7 +206,7 @@ bool PredicateMapping::match(const MachineInstr &MI,
return true;
}
-SetOfRulesForOpcode::SetOfRulesForOpcode() {}
+SetOfRulesForOpcode::SetOfRulesForOpcode() = default;
SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes)
: FastTypes(FastTypes) {}
@@ -239,7 +243,7 @@ UniformityLLTOpPredicateID LLTToBId(LLT Ty) {
return _;
}
-const RegBankLLTMapping &
+const RegBankLLTMapping *
SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI,
const MachineRegisterInfo &MRI,
const MachineUniformityInfo &MUI) const {
@@ -256,17 +260,16 @@ SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI,
Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg)));
if (Slot != -1)
- return MUI.isUniform(Reg) ? Uni[Slot] : Div[Slot];
+ return MUI.isUniform(Reg) ? &Uni[Slot] : &Div[Slot];
}
// Slow search for more complex rules.
for (const RegBankLegalizeRule &Rule : Rules) {
if (Rule.Predicate.match(MI, MUI, MRI))
- return Rule.OperandMapping;
+ return &Rule.OperandMapping;
}
- LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
- llvm_unreachable("None of the rules defined for MI's opcode matched MI");
+ return nullptr;
}
void SetOfRulesForOpcode::addRule(RegBankLegalizeRule Rule) {
@@ -349,7 +352,7 @@ RegBankLegalizeRules::addRulesForIOpcs(std::initializer_list<unsigned> OpcList,
return RuleSetInitializer(OpcList, IRulesAlias, IRules, FastTypes);
}
-const SetOfRulesForOpcode &
+const SetOfRulesForOpcode *
RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
if (Opc == AMDGPU::G_INTRINSIC || Opc == AMDGPU::G_INTRINSIC_CONVERGENT ||
@@ -357,19 +360,15 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const {
Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) {
unsigned IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
auto IRAIt = IRulesAlias.find(IntrID);
- if (IRAIt == IRulesAlias.end()) {
- LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
- llvm_unreachable("No rules defined for intrinsic opcode");
- }
- return IRules.at(IRAIt->second);
+ if (IRAIt == IRulesAlias.end())
+ return nullptr;
+ return &IRules.at(IRAIt->second);
}
auto GRAIt = GRulesAlias.find(Opc);
- if (GRAIt == GRulesAlias.end()) {
- LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
- llvm_unreachable("No rules defined for generic opcode");
- }
- return GRules.at(GRAIt->second);
+ if (GRAIt == GRulesAlias.end())
+ return nullptr;
+ return &GRules.at(GRAIt->second);
}
// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'.
@@ -470,7 +469,19 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
.Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
- .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackAExt})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});
+
+ addRulesForGOpcs({G_UADDO, G_USUBO}, Standard)
+ .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}})
+ .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}});
+
+ addRulesForGOpcs({G_UADDE, G_USUBE}, Standard)
+ .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}})
+ .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}});
addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
@@ -514,6 +525,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
+ addRulesForGOpcs({G_FSHR}, Standard)
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
+
addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}});
addRulesForGOpcs({G_UBFX, G_SBFX}, Standard)
@@ -901,14 +916,56 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
- addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}});
+ addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}});
+
+ addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard)
+ .Uni(S64, {{Sgpr64}, {}});
+
+ addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}});
+
+ addRulesForGOpcs({G_GLOBAL_VALUE})
+ .Any({{UniP0}, {{SgprP0}, {}}})
+ .Any({{UniP1}, {{SgprP1}, {}}})
+ .Any({{UniP3}, {{SgprP3}, {}}})
+ .Any({{UniP4}, {{SgprP4}, {}}})
+ .Any({{UniP8}, {{SgprP8}, {}}});
+
+ addRulesForGOpcs({G_AMDGPU_WAVE_ADDRESS}).Any({{UniP5}, {{SgprP5}, {}}});
bool hasSALUFloat = ST->hasSALUFloatInsts();
- addRulesForGOpcs({G_FADD}, Standard)
+ addRulesForGOpcs({G_FADD, G_FMUL, G_STRICT_FADD, G_STRICT_FMUL}, Standard)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
+ .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
.Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
- .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
+ .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat)
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16},
+ hasSALUFloat)
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
+
+ // FNEG and FABS are either folded as source modifiers or can be selected as
+ // bitwise XOR and AND with a mask. XOR and AND are available on SALU, but for
+ // targets without SALU float we still select them as VGPR since there would
+ // be no real sgpr use.
+ addRulesForGOpcs({G_FNEG, G_FABS}, Standard)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasSALUFloat)
+ .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat)
+ .Div(S16, {{Vgpr16}, {Vgpr16}})
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32}}, !hasSALUFloat)
+ .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat)
+ .Div(S32, {{Vgpr32}, {Vgpr32}})
+ .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
+ .Div(S64, {{Vgpr64}, {Vgpr64}})
+ .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}, !hasSALUFloat)
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, ScalarizeToS16}, hasSALUFloat)
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
+ .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
+ .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}});
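As a side note on the fneg/fabs comment above: for f32 these reduce to a single sign-bit operation, which is what the XOR/AND selection relies on. A minimal stand-alone sketch of the bit patterns (plain IEEE layout assumed, C++20 for std::bit_cast; this is not code from the patch):

#include <bit>
#include <cstdint>

// fneg flips the sign bit, fabs clears it; for f16 the masks would be
// 0x8000 and 0x7fff respectively.
static float fnegViaXor(float X) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(X) ^ 0x80000000u);
}
static float fabsViaAnd(float X) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(X) & 0x7fffffffu);
}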
addRulesForGOpcs({G_FPTOUI})
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
@@ -919,6 +976,14 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
.Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);
+ addRulesForGOpcs({G_IS_FPCLASS})
+ .Any({{DivS1, S16}, {{Vcc}, {Vgpr16}}})
+ .Any({{UniS1, S16}, {{UniInVcc}, {Vgpr16}}})
+ .Any({{DivS1, S32}, {{Vcc}, {Vgpr32}}})
+ .Any({{UniS1, S32}, {{UniInVcc}, {Vgpr32}}})
+ .Any({{DivS1, S64}, {{Vcc}, {Vgpr64}}})
+ .Any({{UniS1, S64}, {{UniInVcc}, {Vgpr64}}});
+
using namespace Intrinsic;
addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}});
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 93e0efd..1ac1173 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -63,6 +63,7 @@ enum UniformityLLTOpPredicateID {
P3,
P4,
P5,
+ P8,
Ptr32,
Ptr64,
Ptr128,
@@ -72,6 +73,7 @@ enum UniformityLLTOpPredicateID {
UniP3,
UniP4,
UniP5,
+ UniP8,
UniPtr32,
UniPtr64,
UniPtr128,
@@ -92,8 +94,10 @@ enum UniformityLLTOpPredicateID {
V4S32,
UniV2S16,
+ UniV2S32,
DivV2S16,
+ DivV2S32,
// B types
B32,
@@ -134,10 +138,12 @@ enum RegBankLLTMappingApplyID {
Sgpr32,
Sgpr64,
Sgpr128,
+ SgprP0,
SgprP1,
SgprP3,
SgprP4,
SgprP5,
+ SgprP8,
SgprPtr32,
SgprPtr64,
SgprPtr128,
@@ -178,7 +184,9 @@ enum RegBankLLTMappingApplyID {
UniInVcc,
UniInVgprS16,
UniInVgprS32,
+ UniInVgprS64,
UniInVgprV2S16,
+ UniInVgprV2S32,
UniInVgprV4S32,
UniInVgprB32,
UniInVgprB64,
@@ -217,13 +225,15 @@ enum LoweringMethodID {
V_BFE,
VgprToVccCopy,
SplitTo32,
+ ScalarizeToS16,
SplitTo32Select,
SplitTo32SExtInReg,
Ext32To64,
UniCstExt,
SplitLoad,
WidenLoad,
- WidenMMOToS32
+ WidenMMOToS32,
+ UnpackAExt
};
enum FastRulesTypes {
@@ -277,7 +287,7 @@ public:
SetOfRulesForOpcode();
SetOfRulesForOpcode(FastRulesTypes FastTypes);
- const RegBankLLTMapping &
+ const RegBankLLTMapping *
findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI,
const MachineUniformityInfo &MUI) const;
@@ -375,7 +385,7 @@ public:
MRI = &_MRI;
};
- const SetOfRulesForOpcode &getRulesForOpc(MachineInstr &MI) const;
+ const SetOfRulesForOpcode *getRulesForOpc(MachineInstr &MI) const;
};
} // end namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 54ba2f8..ce4cc79 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -287,9 +287,6 @@ unsigned AMDGPURegisterBankInfo::getBreakDownCost(
const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
LLT Ty) const {
- if (&RC == &AMDGPU::SReg_1RegClass)
- return AMDGPU::VCCRegBank;
-
// We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
// VCC-like use.
if (TRI->isSGPRClass(&RC)) {
@@ -471,7 +468,7 @@ RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
const MachineInstr &MI) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -2412,7 +2409,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
if (DstBank == &AMDGPU::VCCRegBank)
break;
- MachineFunction *MF = MI.getParent()->getParent();
+ MachineFunction *MF = MI.getMF();
ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
LegalizerHelper Helper(*MF, ApplyBank, B);
@@ -2492,7 +2489,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// There is no VALU abs instruction so we need to replace it with a sub and
// max combination.
if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
- MachineFunction *MF = MI.getParent()->getParent();
+ MachineFunction *MF = MI.getMF();
ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
LegalizerHelper Helper(*MF, Apply, B);
@@ -3114,6 +3111,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
@@ -3607,7 +3606,7 @@ unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
}
bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg())
@@ -3623,7 +3622,7 @@ bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
@@ -3641,7 +3640,7 @@ AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
@@ -3665,7 +3664,7 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
@@ -3744,7 +3743,7 @@ AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 2> OpdsMapping(2);
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
@@ -3834,7 +3833,7 @@ AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
//
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
@@ -4502,6 +4501,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
@@ -5081,17 +5082,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned MinNumRegsRequired = DstSize / 32;
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired);
+
OpdsMapping[0] =
- Info->getMinNumAGPRs() >= MinNumRegsRequired
- ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
- : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] =
- Info->getMinNumAGPRs() >= MinNumRegsRequired
- ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
- : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI);
@@ -5217,11 +5218,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case Intrinsic::amdgcn_wave_reduce_add:
+ case Intrinsic::amdgcn_wave_reduce_fadd:
case Intrinsic::amdgcn_wave_reduce_sub:
+ case Intrinsic::amdgcn_wave_reduce_fsub:
case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_umin:
+ case Intrinsic::amdgcn_wave_reduce_fmin:
case Intrinsic::amdgcn_wave_reduce_max:
case Intrinsic::amdgcn_wave_reduce_umax:
+ case Intrinsic::amdgcn_wave_reduce_fmax:
case Intrinsic::amdgcn_wave_reduce_and:
case Intrinsic::amdgcn_wave_reduce_or:
case Intrinsic::amdgcn_wave_reduce_xor: {
@@ -5304,12 +5309,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
- case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
- case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
@@ -5704,6 +5707,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_FMAX:
case AMDGPU::G_ATOMICRMW_UINC_WRAP:
case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
+ case AMDGPU::G_ATOMICRMW_USUB_COND:
+ case AMDGPU::G_ATOMICRMW_USUB_SAT:
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 1c1a6da..c37d309 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -15,7 +15,7 @@ def VGPRRegBank : RegisterBank<"VGPR",
>;
// It is helpful to distinguish conditions from ordinary SGPRs.
-def VCCRegBank : RegisterBank <"VCC", [SReg_1]>;
+def VCCRegBank : RegisterBank<"VCC", [SReg_32, SReg_64]>;
def AGPRRegBank : RegisterBank <"AGPR",
[AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_288, AReg_320, AReg_352, AReg_384, AReg_512, AReg_1024]
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 0ea9add..b03d50f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -261,13 +261,6 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
const Function *Callee = getCalleeFunction(*CalleeOp);
- // Avoid crashing on undefined behavior with an illegal call to a
- // kernel. If a callsite's calling convention doesn't match the
- // function's, it's undefined behavior. If the callsite calling
- // convention does match, that would have errored earlier.
- if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
- report_fatal_error("invalid call to entry function");
-
auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
return F == &MF.getFunction();
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index 89c16da..ffbb1c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -32,6 +32,7 @@
#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"
@@ -96,8 +97,8 @@ public:
/// Compute the register class constraints based on the uses of \p Reg,
/// excluding MFMA uses from which can be rewritten to change the register
- /// class constraint. This should be nearly identical to
- /// MachineRegisterInfo::recomputeRegClass.
+ /// class constraint. MFMA scale operands still need their constraints checked.
+ /// This should be nearly identical to MachineRegisterInfo::recomputeRegClass.
/// \p RewriteCandidates will collect the set of MFMA instructions that need
/// to have the opcode mutated to perform the replacement.
@@ -151,9 +152,16 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
// We can swap the classes of dst + src2 as a pair to AGPR, so ignore the
// effects of rewrite candidates. It just so happens that we can use
- // either AGPR or VGPR in src0/src1, so don't bother checking the
- // constraint effects of the individual operands.
+ // either AGPR or VGPR in src0/src1. We still need to check the constraint
+ // effects for the scale variant, which does not allow AGPRs.
if (isRewriteCandidate(*MI)) {
+ int AGPROp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode());
+ const MCInstrDesc &AGPRDesc = TII.get(AGPROp);
+ const TargetRegisterClass *NewRC =
+ TII.getRegClass(AGPRDesc, MO.getOperandNo());
+ if (!TRI.hasAGPRs(NewRC))
+ return false;
+
const MachineOperand *VDst =
TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
const MachineOperand *Src2 =
@@ -659,7 +667,11 @@ AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF,
if (!Impl.run(MF))
return PreservedAnalyses::all();
auto PA = getMachineFunctionPassPreservedAnalyses();
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<LiveStacksAnalysis>();
+ PA.preserveSet<CFGAnalyses>()
+ .preserve<LiveStacksAnalysis>()
+ .preserve<VirtRegMapAnalysis>()
+ .preserve<SlotIndexesAnalysis>()
+ .preserve<LiveIntervalsAnalysis>()
+ .preserve<LiveRegMatrixAnalysis>();
return PA;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index 4b1f80c..a2e16c7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -299,7 +299,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
if (Replacements.empty())
return false;
- LLVMContext &Ctx = F.getParent()->getContext();
+ LLVMContext &Ctx = F.getContext();
StructType *NewRetTy = StructType::create(Ctx, ReturnTypes, F.getName());
FunctionType *NewFuncTy = FunctionType::get(NewRetTy,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 2393346..58a9b55 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -237,8 +237,6 @@ def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
def : SourceOfDivergence<int_r600_read_tidig_x>;
def : SourceOfDivergence<int_r600_read_tidig_y>;
def : SourceOfDivergence<int_r600_read_tidig_z>;
-def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>;
-def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>;
@@ -413,3 +411,12 @@ def : AlwaysUniform<int_amdgcn_s_getpc>;
def : AlwaysUniform<int_amdgcn_s_getreg>;
def : AlwaysUniform<int_amdgcn_s_memrealtime>;
def : AlwaysUniform<int_amdgcn_s_memtime>;
+
+def AMDGPUImageDMaskIntrinsicTable : GenericTable {
+ let FilterClass = "AMDGPUImageDMaskIntrinsic";
+ let Fields = ["Intr"];
+
+ let PrimaryKey = ["Intr"];
+ let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic";
+ let PrimaryKeyEarlyOut = 1;
+}
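For context (assumed from the usual SearchableTables output shape, not copied from the generated file): a GenericTable with a PrimaryKeyName like this makes TableGen emit a sorted table plus a lookup function keyed on Intr, roughly along these lines:

//   struct AMDGPUImageDMaskIntrinsic { unsigned Intr; };
//   const AMDGPUImageDMaskIntrinsic *getAMDGPUImageDMaskIntrinsic(unsigned Intr);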
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp
index 2941a48..5b8ee5f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp
@@ -7,13 +7,53 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUSelectionDAGInfo.h"
-#include "AMDGPUISelLowering.h"
+
+#define GET_SDNODE_DESC
+#include "AMDGPUGenSDNodeInfo.inc"
using namespace llvm;
+AMDGPUSelectionDAGInfo::AMDGPUSelectionDAGInfo()
+ : SelectionDAGGenTargetInfo(AMDGPUGenSDNodeInfo) {}
+
AMDGPUSelectionDAGInfo::~AMDGPUSelectionDAGInfo() = default;
-bool AMDGPUSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const {
- return Opcode >= AMDGPUISD::FIRST_MEMORY_OPCODE &&
- Opcode <= AMDGPUISD::LAST_MEMORY_OPCODE;
+const char *AMDGPUSelectionDAGInfo::getTargetNodeName(unsigned Opcode) const {
+#define NODE_NAME_CASE(node) \
+ case AMDGPUISD::node: \
+ return "AMDGPUISD::" #node;
+
+ switch (static_cast<AMDGPUISD::NodeType>(Opcode)) {
+ // These nodes don't have corresponding entries in *.td files yet.
+ NODE_NAME_CASE(WAVE_ADDRESS)
+ NODE_NAME_CASE(MAD_I64_I32)
+ NODE_NAME_CASE(MAD_U64_U32)
+ NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
+ // These do, but only when compiling R600.td,
+ // and the enum is generated from AMDGPU.td.
+ NODE_NAME_CASE(DOT4)
+ NODE_NAME_CASE(TEXTURE_FETCH)
+ NODE_NAME_CASE(R600_EXPORT)
+ NODE_NAME_CASE(CONST_ADDRESS)
+ NODE_NAME_CASE(DUMMY_CHAIN)
+ }
+
+#undef NODE_NAME_CASE
+
+ return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode);
+}
+
+void AMDGPUSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
+ const SDNode *N) const {
+ switch (N->getOpcode()) {
+ case AMDGPUISD::IF:
+ // result #0 must have type i1, but has type i32/i64
+ case AMDGPUISD::ELSE:
+ case AMDGPUISD::LOOP:
+ // operand #1 must have type i1, but has type i32/i64
+ case AMDGPUISD::LDS:
+ // result #0 must have type i64 (iPTR), but has type i32
+ return;
+ }
+ SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h
index 3280be7..bae614a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h
@@ -11,13 +11,49 @@
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#define GET_SDNODE_ENUM
+#include "AMDGPUGenSDNodeInfo.inc"
+
namespace llvm {
+namespace AMDGPUISD {
+
+enum NodeType : unsigned {
+ // Convert an unswizzled wave uniform stack address to an address compatible
+ // with a vector offset for use in stack access.
+ WAVE_ADDRESS = GENERATED_OPCODE_END,
+
+ DOT4,
+ MAD_U64_U32,
+ MAD_I64_I32,
+ TEXTURE_FETCH,
+ R600_EXPORT,
+ CONST_ADDRESS,
-class AMDGPUSelectionDAGInfo : public SelectionDAGTargetInfo {
+ /// This node is for VLIW targets and it is used to represent a vector
+ /// that is stored in consecutive registers with the same channel.
+ /// For example:
+ /// |X |Y|Z|W|
+ /// T0|v.x| | | |
+ /// T1|v.y| | | |
+ /// T2|v.z| | | |
+ /// T3|v.w| | | |
+ BUILD_VERTICAL_VECTOR,
+
+ DUMMY_CHAIN,
+};
+
+} // namespace AMDGPUISD
+
+class AMDGPUSelectionDAGInfo : public SelectionDAGGenTargetInfo {
public:
+ AMDGPUSelectionDAGInfo();
+
~AMDGPUSelectionDAGInfo() override;
- bool isTargetMemoryOpcode(unsigned Opcode) const override;
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ void verifyTargetNode(const SelectionDAG &DAG,
+ const SDNode *N) const override;
};
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index 9af8129..b707882 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -314,9 +314,7 @@ public:
#endif
bool empty() const { return Nodes.empty(); }
- const iterator_range<nodes_iterator> nodes() const {
- return {Nodes.begin(), Nodes.end()};
- }
+ iterator_range<nodes_iterator> nodes() const { return Nodes; }
const Node &getNode(unsigned ID) const { return *Nodes[ID]; }
unsigned getNumNodes() const { return Nodes.size(); }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 26e0b3df..5ca8ee2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -282,7 +282,7 @@ bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
}
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
- Function *Kernel = I->getParent()->getParent();
+ Function *Kernel = I->getFunction();
unsigned MinSize = 0;
unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
bool IdQuery = false;
@@ -350,7 +350,7 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
}
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
- assert(AMDGPU::isKernel(F.getCallingConv()));
+ assert(AMDGPU::isKernel(F));
// We don't allocate the segment if we know the implicit arguments weren't
// used, even if the ABI implies we need them.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index 4a9437b..9bdaa42 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -271,7 +271,7 @@ void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) {
Function *CalledFunc = CallerCGN->getFunction();
if (!CalledFunc || CalledFunc->isDeclaration())
continue;
- if (AMDGPU::isKernelLDS(CalledFunc))
+ if (AMDGPU::isKernel(*CalledFunc))
continue;
for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end();
AI != E; ++AI) {
@@ -297,7 +297,7 @@ void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
for (User *V : GV->users()) {
if (auto *I = dyn_cast<Instruction>(V)) {
Function *F = I->getFunction();
- if (!isKernelLDS(F) && !F->isDeclaration())
+ if (!isKernel(*F) && !F->isDeclaration())
FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(GV);
}
}
@@ -523,7 +523,7 @@ static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
auto ReplaceUsesLambda = [Func](const Use &U) -> bool {
auto *V = U.getUser();
if (auto *Inst = dyn_cast<Instruction>(V)) {
- auto *Func1 = Inst->getParent()->getParent();
+ auto *Func1 = Inst->getFunction();
if (Func == Func1)
return true;
}
@@ -1169,7 +1169,7 @@ bool AMDGPUSwLowerLDS::run() {
if (!F || K.second.empty())
continue;
- assert(isKernelLDS(F));
+ assert(isKernel(*F));
// Only inserts if key isn't already in the map.
FuncLDSAccessInfo.KernelToLDSParametersMap.insert(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 75a94ac..8a831f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
+#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUBarrierLatency.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
@@ -465,6 +466,11 @@ static cl::opt<bool> EnableScalarIRPasses(
cl::init(true),
cl::Hidden);
+static cl::opt<bool> EnableLowerExecSync(
+ "amdgpu-enable-lower-exec-sync",
+ cl::desc("Enable lowering of execution synchronization."), cl::init(true),
+ cl::Hidden);
+
static cl::opt<bool>
EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
cl::desc("Enable lowering of lds to global memory pass "
@@ -567,9 +573,10 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILoadStoreOptimizerLegacyPass(*PR);
initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
+ initializeAMDGPULowerExecSyncLegacyPass(*PR);
initializeAMDGPUSwLowerLDSLegacyPass(*PR);
initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
- initializeAMDGPUArgumentUsageInfoPass(*PR);
+ initializeAMDGPUArgumentUsageInfoWrapperLegacyPass(*PR);
initializeAMDGPUAtomicOptimizerPass(*PR);
initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPUPromoteKernelArgumentsPass(*PR);
@@ -641,7 +648,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
- DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
return DAG;
}
@@ -662,7 +669,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
- DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
return DAG;
}
@@ -816,7 +823,7 @@ parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
Params.consume_front("strategy=");
auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
.Case("dpp", ScanOptions::DPP)
- .Cases("iterative", "", ScanOptions::Iterative)
+ .Cases({"iterative", ""}, ScanOptions::Iterative)
.Case("none", ScanOptions::None)
.Default(std::nullopt);
if (Result)
@@ -962,6 +969,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
// We want to support the -lto-partitions=N option as "best effort".
// For that, we need to lower LDS earlier in the pipeline before the
// module is partitioned for codegen.
+ if (EnableLowerExecSync)
+ PM.addPass(AMDGPULowerExecSyncPass());
if (EnableSwLowerLDS)
PM.addPass(AMDGPUSwLowerLDSPass(*this));
if (EnableLowerModuleLDS)
@@ -1201,7 +1210,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
EnableVOPD)
DAG->addMutation(createVOPDPairingMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
- DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
return DAG;
}
//===----------------------------------------------------------------------===//
@@ -1218,10 +1227,6 @@ class GCNPassConfig final : public AMDGPUPassConfig {
public:
GCNPassConfig(TargetMachine &TM, PassManagerBase &PM)
: AMDGPUPassConfig(TM, PM) {
- // It is necessary to know the register usage of the entire call graph. We
- // allow calls without EnableAMDGPUFunctionCalls if they are marked
- // noinline, so this is always required.
- setRequiresCodeGenSCCOrder(true);
substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
}
@@ -1315,6 +1320,9 @@ void AMDGPUPassConfig::addIRPasses() {
isPassEnabled(EnableImageIntrinsicOptimizer))
addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
+ if (EnableUniformIntrinsicCombine)
+ addPass(createAMDGPUUniformIntrinsicCombineLegacyPass());
+
// This can be disabled by passing ::Disable here or on the command line
// with --expand-variadics-override=disable.
addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
@@ -1330,6 +1338,10 @@ void AMDGPUPassConfig::addIRPasses() {
// Make enqueued block runtime handles externally visible.
addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass());
+ // Lower special LDS accesses.
+ if (EnableLowerExecSync)
+ addPass(createAMDGPULowerExecSyncLegacyPass());
+
// Lower LDS accesses to global memory pass if address sanitizer is enabled.
if (EnableSwLowerLDS)
addPass(createAMDGPUSwLowerLDSLegacyPass(&TM));
@@ -1415,9 +1427,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
// many cases.
addPass(createAMDGPULowerBufferFatPointersPass());
addPass(createAMDGPULowerIntrinsicsLegacyPass());
- // In accordance with the above FIXME, manually force all the
- // function-level passes into a CGSCCPassManager.
- addPass(new DummyCGSCCPass());
}
// LowerSwitch pass may introduce unreachable blocks that can
@@ -2012,6 +2021,42 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
MFI->ArgInfo.WorkItemIDZ, 0, 0)))
return true;
+ // Parse FirstKernArgPreloadReg separately, since it's a Register,
+ // not ArgDescriptor.
+ if (YamlMFI.ArgInfo && YamlMFI.ArgInfo->FirstKernArgPreloadReg) {
+ const yaml::SIArgument &A = *YamlMFI.ArgInfo->FirstKernArgPreloadReg;
+
+ if (!A.IsRegister) {
+ // For stack arguments, we don't have RegisterName.SourceRange,
+ // but we should have some location info from the YAML parser
+ const MemoryBuffer &Buffer =
+ *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
+ // Create a minimal valid source range
+ SMLoc Loc = SMLoc::getFromPointer(Buffer.getBufferStart());
+ SMRange Range(Loc, Loc);
+
+ Error = SMDiagnostic(
+ *PFS.SM, Loc, Buffer.getBufferIdentifier(), 1, 0, SourceMgr::DK_Error,
+ "firstKernArgPreloadReg must be a register, not a stack location", "",
+ {}, {});
+
+ SourceRange = Range;
+ return true;
+ }
+
+ Register Reg;
+ if (parseNamedRegisterReference(PFS, Reg, A.RegisterName.Value, Error)) {
+ SourceRange = A.RegisterName.SourceRange;
+ return true;
+ }
+
+ if (!AMDGPU::SGPR_32RegClass.contains(Reg))
+ return diagnoseRegisterClass(A.RegisterName);
+
+ MFI->ArgInfo.FirstKernArgPreloadReg = Reg;
+ MFI->NumUserSGPRs += YamlMFI.NumKernargPreloadSGPRs;
+ }
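Illustrative MIR input that this path accepts (the YAML field spellings and the SGPR choice are assumptions based on the code above, not taken from a test):

//   machineFunctionInfo:
//     argumentInfo:
//       firstKernArgPreloadReg: { reg: '$sgpr8' }
//     numKernargPreloadSGPRs: 2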
+
if (ST.hasIEEEMode())
MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
if (ST.hasDX10ClampMode())
@@ -2066,6 +2111,8 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
if (isPassEnabled(EnableImageIntrinsicOptimizer))
addPass(AMDGPUImageIntrinsicOptimizerPass(TM));
+ if (EnableUniformIntrinsicCombine)
+ addPass(AMDGPUUniformIntrinsicCombinePass());
// This can be disabled by passing ::Disable here or on the command line
// with --expand-variadics-override=disable.
addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering));
@@ -2075,6 +2122,9 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
addPass(AMDGPUExportKernelRuntimeHandlesPass());
+ if (EnableLowerExecSync)
+ addPass(AMDGPULowerExecSyncPass());
+
if (EnableSwLowerLDS)
addPass(AMDGPUSwLowerLDSPass(TM));
@@ -2158,6 +2208,9 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
+ // Require AMDGPUArgumentUsageAnalysis so that it's available during ISel.
+ addPass(RequireAnalysisPass<AMDGPUArgumentUsageAnalysis, Module>());
+
if (TM.getOptLevel() > CodeGenOptLevel::None) {
addPass(FlattenCFGPass());
addPass(SinkingPass());
@@ -2345,11 +2398,10 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
addPass(SIMemoryLegalizerPass());
addPass(SIInsertWaitcntsPass());
- // TODO: addPass(SIModeRegisterPass());
+ addPass(SIModeRegisterPass());
- if (TM.getOptLevel() > CodeGenOptLevel::None) {
- // TODO: addPass(SIInsertHardClausesPass());
- }
+ if (TM.getOptLevel() > CodeGenOptLevel::None)
+ addPass(SIInsertHardClausesPass());
addPass(SILateBranchLoweringPass());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 03d16fd..dfa2151 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1241,46 +1241,123 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
(ScalarSize == 16 || ScalarSize == 8)) {
// Larger vector widths may require additional instructions, but are
// typically cheaper than scalarized versions.
- unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
- unsigned RequestedElts =
- count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
+ //
+ // We assume that shuffling at a register granularity can be done for free.
+ // This is not true for vectors fed into memory instructions, but it is
+ // effectively true for all other shuffling. The emphasis of the logic here
+ // is to assist generic transforms in cleaning up / canonicalizing those
+ // shuffles.
+
+ // With op_sel, VOP3P instructions can freely access the low half or high
+ // half of a register, so any swizzle of two elements is free.
+ if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) {
+ unsigned NumSrcElts = SrcVecTy->getNumElements();
+ if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
+ (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
+ Kind == TTI::SK_PermuteSingleSrc))
+ return 0;
+ }
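For example (illustrative IR, not from a test): on a subtarget with VOP3P, a whole-vector reverse of <2 x i16> is now costed at 0, since op_sel can read either half of the 32-bit register:

//   %r = shufflevector <2 x i16> %v, <2 x i16> poison, <2 x i32> <i32 1, i32 0>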
+
unsigned EltsPerReg = 32 / ScalarSize;
- if (RequestedElts == 0)
- return 0;
switch (Kind) {
case TTI::SK_Broadcast:
+ // A single v_perm_b32 can be re-used for all destination registers.
+ return 1;
case TTI::SK_Reverse:
- case TTI::SK_PermuteSingleSrc: {
- // With op_sel VOP3P instructions freely can access the low half or high
- // half of a register, so any swizzle of two elements is free.
- if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
- return 0;
- unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
- // SK_Broadcast just reuses the same mask
- unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
- return NumPerms + NumPermMasks;
- }
+ // One instruction per register.
+ if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
+ return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
+ return InstructionCost::getInvalid();
case TTI::SK_ExtractSubvector:
+ if (Index % EltsPerReg == 0)
+ return 0; // Shuffling at register granularity
+ if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
+ return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
+ return InstructionCost::getInvalid();
case TTI::SK_InsertSubvector: {
- // Even aligned accesses are free
- if (!(Index % 2))
- return 0;
- // Insert/extract subvectors only require shifts / extract code to get the
- // relevant bits
- return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
+ auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
+ if (!DstVecTy)
+ return InstructionCost::getInvalid();
+ unsigned NumDstElts = DstVecTy->getNumElements();
+ unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();
+ unsigned EndIndex = Index + NumInsertElts;
+ unsigned BeginSubIdx = Index % EltsPerReg;
+ unsigned EndSubIdx = EndIndex % EltsPerReg;
+ unsigned Cost = 0;
+
+ if (BeginSubIdx != 0) {
+ // Need to shift the inserted vector into place. The cost is the number
+ // of destination registers overlapped by the inserted vector.
+ Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);
+ }
+
+ // If the last register overlap is partial, there may be three source
+ // registers feeding into it; that takes an extra instruction.
+ if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
+ Cost += 1;
+
+ return Cost;
}
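Worked instances of the insert-subvector cost above, assuming 16-bit elements so EltsPerReg == 2 (numbers derived from the code, not measured):

//   insert <2 x i16> into <8 x i16> at Index 1: EndIndex = 3, BeginSubIdx = 1,
//     Cost = ceil(3/2) - 1/2 = 2 (the unaligned insert spans two dst registers).
//   insert <3 x i16> at Index 0: aligned start, but the last touched register
//     mixes inserted and original elements, so Cost = 1.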
- case TTI::SK_PermuteTwoSrc:
- case TTI::SK_Splice:
- case TTI::SK_Select: {
- unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
- // SK_Select just reuses the same mask
- unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
- return NumPerms + NumPermMasks;
+ case TTI::SK_Splice: {
+ auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
+ if (!DstVecTy)
+ return InstructionCost::getInvalid();
+ unsigned NumElts = DstVecTy->getNumElements();
+ assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
+ // Determine the sub-region of the result vector that requires
+ // sub-register shuffles / mixing.
+ unsigned EltsFromLHS = NumElts - Index;
+ bool LHSIsAligned = (Index % EltsPerReg) == 0;
+ bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
+ if (LHSIsAligned && RHSIsAligned)
+ return 0;
+ if (LHSIsAligned && !RHSIsAligned)
+ return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
+ if (!LHSIsAligned && RHSIsAligned)
+ return divideCeil(EltsFromLHS, EltsPerReg);
+ return divideCeil(NumElts, EltsPerReg);
}
-
default:
break;
}
+
+ if (!Mask.empty()) {
+ unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
+
+ // Generically estimate the cost by assuming that each destination
+ // register is derived from sources via v_perm_b32 instructions if it
+ // can't be copied as-is.
+ //
+ // For each destination register, derive the cost of obtaining it based
+ // on the number of source registers that feed into it.
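+ // For example, a two-source interleave of <4 x half> with mask <0,4,1,5>:
+ // each destination register mixes one register from each source, so the
+ // estimate is 2 (one v_perm_b32 per destination register).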
+ unsigned Cost = 0;
+ for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
+ SmallVector<int, 4> Regs;
+ bool Aligned = true;
+ for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
+ int SrcIdx = Mask[DstIdx + I];
+ if (SrcIdx == -1)
+ continue;
+ int Reg;
+ if (SrcIdx < (int)NumSrcElts) {
+ Reg = SrcIdx / EltsPerReg;
+ if (SrcIdx % EltsPerReg != I)
+ Aligned = false;
+ } else {
+ Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
+ if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
+ Aligned = false;
+ }
+ if (!llvm::is_contained(Regs, Reg))
+ Regs.push_back(Reg);
+ }
+ if (Regs.size() >= 2)
+ Cost += Regs.size() - 1;
+ else if (!Aligned)
+ Cost += 1;
+ }
+ return Cost;
+ }
}
return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 65e6ed9..c52eb4e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -57,10 +57,11 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
const UniformityInfo &UI,
ValueMap<const Value *, bool> &Tracker) {
llvm::Intrinsic::ID IID = II.getIntrinsicID();
-
+ // We deliberately do not simplify readfirstlane with a uniform argument, so
+ // that frontends can use it to force a copy to SGPR and thereby prevent the
+ // backend from generating unwanted waterfall loops.
switch (IID) {
case Intrinsic::amdgcn_permlane64:
- case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
Value *Src = II.getArgOperand(0);
if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
@@ -107,7 +108,7 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
return Changed;
}
default:
- llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic");
+ return false;
}
return false;
}
@@ -121,16 +122,6 @@ static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) {
auto *II = dyn_cast<IntrinsicInst>(&I);
if (!II)
continue;
-
- switch (II->getIntrinsicID()) {
- case Intrinsic::amdgcn_permlane64:
- case Intrinsic::amdgcn_readfirstlane:
- case Intrinsic::amdgcn_readlane:
- case Intrinsic::amdgcn_ballot:
- break;
- default:
- continue;
- }
IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
}
return IsChanged;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 733c5d5..fe81a5e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet(
return NewRetBlock;
}
+static BasicBlock *
+createDummyReturnBlock(Function &F,
+ SmallVector<BasicBlock *, 4> &ReturningBlocks) {
+ BasicBlock *DummyReturnBB =
+ BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F);
+ Type *RetTy = F.getReturnType();
+ Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
+ ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
+ ReturningBlocks.push_back(DummyReturnBB);
+ return DummyReturnBB;
+}
+
+/// Handle conditional branch instructions (two targets) and callbr
+/// instructions with N targets.
+static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI,
+ BasicBlock *DummyReturnBB,
+ std::vector<DominatorTree::UpdateType> &Updates) {
+ SmallVector<BasicBlock *, 2> Successors(successors(BB));
+
+ // Create a new transition block to hold the conditional branch.
+ BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
+
+ Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
+
+ // 'Successors' become successors of TransitionBB instead of BB,
+ // and TransitionBB becomes a single successor of BB.
+ Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
+ for (BasicBlock *Successor : Successors) {
+ Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
+ Updates.emplace_back(DominatorTree::Delete, BB, Successor);
+ }
+
+ // Create a conditional branch that always branches to the transition block
+ // while keeping DummyReturnBB as a successor.
+ BB->getTerminator()->eraseFromParent();
+ BranchInst::Create(TransitionBB, DummyReturnBB,
+ ConstantInt::getTrue(F.getContext()), BB);
+ Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
+}
+
bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
const PostDominatorTree &PDT,
const UniformityInfo &UA) {
- assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
-
if (PDT.root_size() == 0 ||
(PDT.root_size() == 1 &&
- !isa<BranchInst>(PDT.getRoot()->getTerminator())))
+ !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator())))
return false;
// Loop over all of the blocks in a function, tracking all of the blocks that
@@ -222,46 +260,28 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
if (HasDivergentExitBlock)
UnreachableBlocks.push_back(BB);
} else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
-
- ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
- if (DummyReturnBB == nullptr) {
- DummyReturnBB = BasicBlock::Create(F.getContext(),
- "DummyReturnBlock", &F);
- Type *RetTy = F.getReturnType();
- Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
- ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
- ReturningBlocks.push_back(DummyReturnBB);
- }
+ if (!DummyReturnBB)
+ DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
if (BI->isUnconditional()) {
BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
BI->eraseFromParent(); // Delete the unconditional branch.
// Add a new conditional branch with a dummy edge to the return block.
- BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
- Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
- } else { // Conditional branch.
- SmallVector<BasicBlock *, 2> Successors(successors(BB));
-
- // Create a new transition block to hold the conditional branch.
- BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
-
- Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
-
- // 'Successors' become successors of TransitionBB instead of BB,
- // and TransitionBB becomes a single successor of BB.
- Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
- for (BasicBlock *Successor : Successors) {
- Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
- Updates.emplace_back(DominatorTree::Delete, BB, Successor);
- }
-
- // Create a branch that will always branch to the transition block and
- // references DummyReturnBB.
- BB->getTerminator()->eraseFromParent();
- BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
+ BranchInst::Create(LoopHeaderBB, DummyReturnBB,
+ ConstantInt::getTrue(F.getContext()), BB);
Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
+ } else {
+ handleNBranch(F, BB, BI, DummyReturnBB, Updates);
}
Changed = true;
+ } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) {
+ if (!DummyReturnBB)
+ DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
+
+ handleNBranch(F, BB, CBI, DummyReturnBB, Updates);
+ Changed = true;
+ } else {
+ llvm_unreachable("unsupported block terminator");
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
index 61c5dcd..8d8386c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
@@ -44,6 +44,7 @@ namespace {
class AMDGPUWaitSGPRHazards {
public:
+ const GCNSubtarget *ST;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const MachineRegisterInfo *MRI;
@@ -54,7 +55,7 @@ public:
bool CullSGPRHazardsAtMemWait;
unsigned CullSGPRHazardsMemWaitThreshold;
- AMDGPUWaitSGPRHazards() {}
+ AMDGPUWaitSGPRHazards() = default;
// Return the numeric ID 0-127 for a given SGPR.
static std::optional<unsigned> sgprNumber(Register Reg,
@@ -165,7 +166,7 @@ public:
}
unsigned mergeMasks(unsigned Mask1, unsigned Mask2) {
- unsigned Mask = 0xffff;
+ unsigned Mask = AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST);
Mask = AMDGPU::DepCtr::encodeFieldSaSdst(
Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1),
AMDGPU::DepCtr::decodeFieldSaSdst(Mask2)));
@@ -387,7 +388,7 @@ public:
// Apply wait
if (Wait) {
- unsigned Mask = 0xffff;
+ unsigned Mask = AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST);
if (Wait & WA_VCC) {
State.VCCHazard &= ~HazardState::VALU;
Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0);
@@ -438,8 +439,8 @@ public:
}
bool run(MachineFunction &MF) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (!ST.hasVALUReadSGPRHazard())
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!ST->hasVALUReadSGPRHazard())
return false;
// Parse settings
@@ -467,10 +468,10 @@ public:
if (!EnableSGPRHazardWaits)
return false;
- TII = ST.getInstrInfo();
- TRI = ST.getRegisterInfo();
+ TII = ST->getInstrInfo();
+ TRI = ST->getRegisterInfo();
MRI = &MF.getRegInfo();
- DsNopCount = ST.isWave64() ? WAVE64_NOPS : WAVE32_NOPS;
+ DsNopCount = ST->isWave64() ? WAVE64_NOPS : WAVE32_NOPS;
auto CallingConv = MF.getFunction().getCallingConv();
if (!AMDGPU::isEntryFunctionCC(CallingConv) &&
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 09338c5..7a91a40 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -347,6 +347,11 @@ public:
return isRegKind() && getReg() == AMDGPU::SGPR_NULL;
}
+ bool isAV_LdSt_32_Align2_RegOp() const {
+ return isRegClass(AMDGPU::VGPR_32RegClassID) ||
+ isRegClass(AMDGPU::AGPR_32RegClassID);
+ }
+
bool isVRegWithInputMods() const;
template <bool IsFake16> bool isT16_Lo128VRegWithInputMods() const;
template <bool IsFake16> bool isT16VRegWithInputMods() const;
@@ -1865,7 +1870,7 @@ private:
unsigned getConstantBusLimit(unsigned Opcode) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
- unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
+ MCRegister findImplicitSGPRReadInVOP(const MCInst &Inst) const;
bool isSupportedMnemo(StringRef Mnemo,
const FeatureBitset &FBS);
@@ -3665,7 +3670,8 @@ StringRef AMDGPUAsmParser::getMatchedVariantName() const {
return "";
}
-unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
+MCRegister
+AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
for (MCPhysReg Reg : Desc.implicit_uses()) {
switch (Reg) {
@@ -3679,7 +3685,7 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
break;
}
}
- return AMDGPU::NoRegister;
+ return MCRegister();
}
// NB: This code is correct only when used to check constant
@@ -3854,9 +3860,9 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(
LiteralSize = 4;
}
- SmallDenseSet<unsigned> SGPRsUsed;
- unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst);
- if (SGPRUsed != AMDGPU::NoRegister) {
+ SmallDenseSet<MCRegister> SGPRsUsed;
+ MCRegister SGPRUsed = findImplicitSGPRReadInVOP(Inst);
+ if (SGPRUsed) {
SGPRsUsed.insert(SGPRUsed);
++ConstantBusUseCount;
}
@@ -7043,6 +7049,12 @@ ParseStatus AMDGPUAsmParser::parseNamedBit(StringRef Name,
if (Name == "a16" && !hasA16())
return Error(S, "a16 modifier is not supported on this GPU");
+ if (Bit == 0 && Name == "gds") {
+ StringRef Mnemo = ((AMDGPUOperand &)*Operands[0]).getToken();
+ if (Mnemo.starts_with("ds_gws"))
+ return Error(S, "nogds is not allowed");
+ }
+
if (isGFX9() && ImmTy == AMDGPUOperand::ImmTyA16)
ImmTy = AMDGPUOperand::ImmTyR128A16;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index b97b738..bb0e938 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -783,37 +783,20 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
RegisterOperand vdataClass,
- ValueType vdataType,
- SDPatternOperator atomic> {
+ ValueType vdataType> {
let FPAtomic = vdataType.isFP in {
- def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0,
- [(set vdataType:$vdata,
- (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset),
- vdataType:$vdata_in))]>,
- MUBUFAddr64Table <0, NAME # "_RTN">;
-
- def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0,
- [(set vdataType:$vdata,
- (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset),
- vdataType:$vdata_in))]>,
- MUBUFAddr64Table <1, NAME # "_RTN">;
-
+ def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0>,
+ MUBUFAddr64Table <0, NAME # "_RTN">;
+ def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0>,
+ MUBUFAddr64Table <1, NAME # "_RTN">;
def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, 0>;
def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, 0>;
def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, 0>;
- def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1,
- [(set vdataType:$vdata,
- (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset),
- vdataType:$vdata_in))]>,
- MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">;
-
- def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1,
- [(set vdataType:$vdata,
- (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset),
- vdataType:$vdata_in))]>,
- MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">;
-
+ def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1>,
+ MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">;
+ def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1>,
+ MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">;
def _VBUFFER_OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.OffEn, vdataClass, 1>;
def _VBUFFER_IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.IdxEn, vdataClass, 1>;
def _VBUFFER_BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.BothEn, vdataClass, 1>;
@@ -822,10 +805,9 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
multiclass MUBUF_Pseudo_Atomics <string opName,
RegisterOperand vdataClass,
- ValueType vdataType,
- SDPatternOperator atomic = null_frag> :
+ ValueType vdataType> :
MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType>,
- MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType, atomic>;
+ MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType>;
//===----------------------------------------------------------------------===//
@@ -1096,7 +1078,7 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
let OtherPredicates = [HasGFX10_BEncoding] in {
defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics <
- "buffer_atomic_csub", VGPROp_32, i32, int_amdgcn_global_atomic_csub
+ "buffer_atomic_csub", VGPROp_32, i32
>;
}
@@ -1117,22 +1099,22 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc",
let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fcmpswap", AVLdSt_64, v2f32, null_frag
+ "buffer_atomic_fcmpswap", AVLdSt_64, v2f32
>;
}
let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in {
defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fmin", AVLdSt_32, f32, null_frag
+ "buffer_atomic_fmin", AVLdSt_32, f32
>;
defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fmax", AVLdSt_32, f32, null_frag
+ "buffer_atomic_fmax", AVLdSt_32, f32
>;
}
let SubtargetPredicate = isGFX6GFX7GFX10 in {
defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64, null_frag
+ "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64
>;
}
@@ -1201,12 +1183,12 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
let SubtargetPredicate = HasAtomicFaddRtnInsts in
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN<
- "buffer_atomic_add_f32", AVLdSt_32, f32, null_frag
+ "buffer_atomic_add_f32", AVLdSt_32, f32
>;
let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
- "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16, null_frag
+ "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16
>;
let SubtargetPredicate = isGFX12Plus in {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index a1e0e52..782cbfa 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -15,6 +15,7 @@ tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering)
tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank)
tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM AMDGPUGenSDNodeInfo.inc -gen-sd-node-info)
tablegen(LLVM AMDGPUGenSearchableTables.inc -gen-searchable-tables)
tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget)
@@ -39,10 +40,6 @@ tablegen(LLVM R600GenSubtargetInfo.inc -gen-subtarget)
add_public_tablegen_target(AMDGPUCommonTableGen)
-set(LLVM_TARGET_DEFINITIONS InstCombineTables.td)
-tablegen(LLVM InstCombineTables.inc -gen-searchable-tables)
-add_public_tablegen_target(InstCombineTableGen)
-
add_llvm_target(AMDGPUCodeGen
AMDGPUAliasAnalysis.cpp
AMDGPUAlwaysInlinePass.cpp
@@ -81,6 +78,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
AMDGPUPrepareAGPRAlloc.cpp
+ AMDGPULowerExecSync.cpp
AMDGPUSwLowerLDS.cpp
AMDGPUMachineFunction.cpp
AMDGPUMachineModuleInfo.cpp
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index b841171..3a53cef 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -463,7 +463,7 @@ class DS_GWS_0D <string opName>
class DS_GWS_1D <string opName>
: DS_GWS<opName,
- (ins AVLdSt_32:$data0, Offset:$offset),
+ (ins AV_LdSt_32_Align2_RegOp:$data0, Offset:$offset),
" $data0$offset gds"> {
let has_gws_data0 = 1;
@@ -886,17 +886,6 @@ defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc_gfx9<"ds_sub_clamp_rtn_u32", VGPROp_3
def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32",
int_amdgcn_ds_bpermute_fi_b32>;
-multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
- ValueType vt, string frag> {
- def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_local_addrspace")>;
-
- let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_noret_local_addrspace"), /* complexity */ 1>;
-}
-
-defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">;
} // let SubtargetPredicate = isGFX12Plus
let SubtargetPredicate = isGFX1250Plus in {
@@ -1279,6 +1268,14 @@ defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "ato
defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">;
}
+let SubtargetPredicate = HasAtomicDsCondSubClampInsts in {
+
+defm : DSAtomicRetNoRetPat_NoM0_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "atomic_load_usub_cond">;
+
+defm : DSAtomicRetNoRetPat_NoM0_mc<DS_SUB_CLAMP_RTN_U32, DS_SUB_CLAMP_U32, i32, "atomic_load_usub_sat">;
+
+} // let SubtargetPredicate = HasAtomicDsCondSubClampInsts
+
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">;
}
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index e3f3aba..dd3120f 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -1199,8 +1199,8 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
// Given a wide tuple \p Reg check if it will overflow 256 registers.
// \returns \p Reg on success or NoRegister otherwise.
-static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC,
- const MCRegisterInfo &MRI) {
+static MCRegister CheckVGPROverflow(MCRegister Reg, const MCRegisterClass &RC,
+ const MCRegisterInfo &MRI) {
unsigned NumRegs = RC.getSizeInBits() / 32;
MCRegister Sub0 = MRI.getSubReg(Reg, AMDGPU::sub0);
if (!Sub0)
@@ -1214,7 +1214,7 @@ static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC,
assert(BaseReg && "Only vector registers expected");
- return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : AMDGPU::NoRegister;
+ return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : MCRegister();
}
// Note that before gfx10, the MIMG encoding provided no information about
@@ -1456,9 +1456,8 @@ MCOperand AMDGPUDisassembler::errOperand(unsigned V,
return MCOperand();
}
-inline
-MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const {
- return MCOperand::createReg(AMDGPU::getMCReg(RegId, STI));
+inline MCOperand AMDGPUDisassembler::createRegOperand(MCRegister Reg) const {
+ return MCOperand::createReg(AMDGPU::getMCReg(Reg, STI));
}
inline
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index d103d79..ab130db 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -69,7 +69,7 @@ public:
const char* getRegClassName(unsigned RegClassID) const;
- MCOperand createRegOperand(unsigned int RegId) const;
+ MCOperand createRegOperand(MCRegister Reg) const;
MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const;
MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const;
MCOperand createVGPR16Operand(unsigned RegIdx, bool IsHi) const;
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index dadc7dc..a2e3ece 100644
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -505,7 +505,6 @@ def : AMDGPUPat <
(fshr i32:$src0, i32:$src1, i32:$src2),
(BIT_ALIGN_INT_eg $src0, $src1, $src2)
>;
-def : ROTRPattern <BIT_ALIGN_INT_eg>;
def MULADD_eg : MULADD_Common<0x14>;
def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
def FMA_eg : FMA_Common<0x7>;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 8ea64d1..9e38af9 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -262,8 +262,18 @@ multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterOperand regClass = AVLdS
multiclass FLAT_Flat_Load_Pseudo_t16<string opName> {
defm "" : FLAT_Flat_Load_Pseudo<opName, AVLdSt_32, 1>;
- let True16Predicate = UseRealTrue16Insts in
- defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPROp_16>, True16D16Table<NAME#"_HI", NAME>;
+
+ defvar Name16 = opName#"_t16";
+ let True16Predicate = UseRealTrue16Insts in {
+ def _t16 : FLAT_Load_Pseudo<Name16, VGPROp_16>,
+ GlobalSaddrTable<0, Name16>,
+ True16D16Table<NAME#"_HI", NAME>;
+
+ let OtherPredicates = [HasFlatGVSMode] in
+ def _t16_SADDR : FLAT_Load_Pseudo<Name16, VGPROp_16, 0, 1, 1>,
+ GlobalSaddrTable<1, Name16>,
+ True16D16Table<NAME#"_HI_SADDR", NAME#"_SADDR">;
+ }
}
class FLAT_Store_Pseudo <string opName, RegisterOperand vdataClass,
@@ -1552,10 +1562,6 @@ multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType
}
}
-multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addrSpaceSuffix,
- ValueType vt> :
- FlatAtomicNoRtnPatBase<inst, node # "_noret_" # addrSpaceSuffix, vt, vt>;
-
multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>;
@@ -1580,10 +1586,6 @@ multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt,
}
}
-multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix,
- ValueType vt> :
- FlatAtomicRtnPatBase<inst, intr # "_" # addrSpaceSuffix, vt, vt>;
-
multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt), vt, data_vt>;
@@ -2169,14 +2171,16 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
}
+let SubtargetPredicate = HasAtomicCondSubClampFlatInsts in {
+ defm : FlatAtomicRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_" #as, i32 >;
+
+ defm : FlatAtomicNoRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32>;
+}
} // end foreach as
defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
-defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
-defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
-
let OtherPredicates = [HasD16LoadStore] in {
defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
@@ -2340,10 +2344,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", "atomic_load_or_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>;
-defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
-defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>;
@@ -2360,10 +2364,8 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>;
let SubtargetPredicate = isGFX12Plus in {
- defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
-
- let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
- defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+ defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
+ defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
}
let OtherPredicates = [isGFX12PlusNot12_50] in
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 52cc4ca..6f1a521 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -190,6 +190,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (checkFPAtomicToDenormModeHazard(MI) > 0)
return HazardType;
+ // Hazards which cannot be mitigated with S_NOPs.
+ if (!IsHazardRecognizerMode) {
+ if (checkWMMACoexecutionHazards(MI) > 0)
+ return Hazard;
+ }
+
if (ST.hasNoDataDepHazard())
return NoHazard;
@@ -435,10 +441,7 @@ void GCNHazardRecognizer::RecedeCycle() {
// Helper Functions
//===----------------------------------------------------------------------===//
-using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
-
-using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
-using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
+enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound };
// Search for a hazard in a block and its predecessors.
template <typename StateT>
@@ -546,11 +549,14 @@ hasHazard(StateT InitialState,
// Returns a minimum wait states since \p I walking all predecessors.
// Only scans until \p IsExpired does not return true.
// Can only be run in a hazard recognizer mode.
-static int getWaitStatesSince(
- GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
- MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
- IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
- GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
+static int
+getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+ const MachineBasicBlock *MBB,
+ MachineBasicBlock::const_reverse_instr_iterator I,
+ int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
+ DenseSet<const MachineBasicBlock *> &Visited,
+ GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
+ SIInstrInfo::getNumWaitStates) {
for (auto E = MBB->instr_rend(); I != E; ++I) {
// Don't add WaitStates for parent BUNDLE instructions.
if (I->isBundle())
@@ -582,20 +588,26 @@ static int getWaitStatesSince(
return MinWaitStates;
}
-static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
- const MachineInstr *MI, IsExpiredFn IsExpired) {
+static int
+getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+ const MachineInstr *MI,
+ GCNHazardRecognizer::IsExpiredFn IsExpired,
+ GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
+ SIInstrInfo::getNumWaitStates) {
DenseSet<const MachineBasicBlock *> Visited;
return getWaitStatesSince(IsHazard, MI->getParent(),
std::next(MI->getReverseIterator()), 0, IsExpired,
- Visited, SIInstrInfo::getNumWaitStates);
+ Visited, GetNumWaitStates);
}
-int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
+int GCNHazardRecognizer::getWaitStatesSince(
+ IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) {
if (IsHazardRecognizerMode) {
auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
return WaitStates >= Limit;
};
- return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
+ return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
+ GetNumWaitStates);
}
int WaitStates = 0;
@@ -607,7 +619,7 @@ int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
if (MI->isInlineAsm())
continue;
}
- ++WaitStates;
+ WaitStates += MI ? GetNumWaitStates(*MI) : 1;
if (WaitStates >= Limit)
break;
@@ -615,6 +627,10 @@ int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
return std::numeric_limits<int>::max();
}
+int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
+ return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates);
+}
+
int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
IsHazardFn IsHazardDef,
int Limit) {
@@ -643,7 +659,7 @@ int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
MCRegister Reg) {
for (MCRegUnit Unit : TRI.regunits(Reg))
- BV.set(Unit);
+ BV.set(static_cast<unsigned>(Unit));
}
static void addRegsToSet(const SIRegisterInfo &TRI,
@@ -1243,6 +1259,20 @@ int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}
+// Emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we need
+// to insert; a negative value means none are needed.
+bool GCNHazardRecognizer::emitVNops(MachineInstr *MI, int WaitStatesNeeded) {
+ if (WaitStatesNeeded <= 0)
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ for (int I = 0; I < WaitStatesNeeded; ++I)
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_NOP_e32));
+
+ return true;
+}
+
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVMEMtoScalarWriteHazards(MI);
fixVcmpxPermlaneHazards(MI);
@@ -1257,7 +1287,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVALUTransUseHazard(MI);
fixVALUTransCoexecutionHazards(MI);
fixWMMAHazards(MI); // fall-through if co-execution is enabled.
- fixWMMACoexecutionHazards(MI);
+ emitVNops(MI, checkWMMACoexecutionHazards(MI));
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
fixRequiredExportPriority(MI);
@@ -1354,7 +1384,7 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
const SIInstrInfo *TII = ST.getInstrInfo();
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST));
return true;
}
@@ -1487,7 +1517,7 @@ bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0, ST));
return true;
}
@@ -1502,9 +1532,8 @@ static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
bool HasVmem = false;
for (auto &MBB : MF) {
for (auto &MI : MBB) {
- HasLds |= SIInstrInfo::isDS(MI);
- HasVmem |= (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
- SIInstrInfo::isSegmentSpecificFLAT(MI);
+ HasLds |= SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI);
+ HasVmem |= SIInstrInfo::isVMEM(MI);
if (HasLds && HasVmem)
return true;
}
@@ -1526,10 +1555,9 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
assert(!ST.hasExtendedWaitCounts());
auto IsHazardInst = [](const MachineInstr &MI) {
- if (SIInstrInfo::isDS(MI))
+ if (SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI))
return 1;
- if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
- SIInstrInfo::isSegmentSpecificFLAT(MI))
+ if (SIInstrInfo::isVMEM(MI))
return 2;
return 0;
};
@@ -1653,7 +1681,7 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
} else {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST));
}
return true;
@@ -1811,7 +1839,7 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0, ST));
return true;
}
@@ -1897,7 +1925,7 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
// avoided.
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0, ST));
return true;
}
@@ -2047,13 +2075,13 @@ static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
return false;
}
-bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
+int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
if (!AMDGPU::isGFX1250(ST))
- return false;
+ return 0;
const SIInstrInfo *TII = ST.getInstrInfo();
if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
- return false;
+ return 0;
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -2131,9 +2159,6 @@ bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
};
int Limit = 0;
- auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) {
- return WaitStates >= Limit;
- };
auto GetWaitStatesFn = [](const MachineInstr &I) {
return SIInstrInfo::isVALU(I) ? 1 : 0;
@@ -2143,38 +2168,26 @@ bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
if (TII->isXDLWMMA(*MI)) {
for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
- DenseSet<const MachineBasicBlock *> Visited;
- // '::getWaitStatesSince' returns the number of VALUs in between if hazard
+ // 'getWaitStatesSince' returns the number of VALUs in between if a hazard
// exists, and INT_MAX if there is no hazard. As a result, a negative
// WaitStatesNeeded here means no hazard, and we will continue to search
// for other categories.
WaitStatesNeeded =
- Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(),
- std::next(MI->getReverseIterator()), 0,
- IsExpiredFn, Visited, GetWaitStatesFn);
+ Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn);
}
} else { // Must be a co-executable VALU.
for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
Limit = VALUWaitStates[Category]; // for IsExpiredFn.
- DenseSet<const MachineBasicBlock *> Visited;
- // '::getWaitStatesSince' returns the number of VALUs in between if hazard
+ // 'getWaitStatesSince' returns the number of VALUs in between if a hazard
// exists, and INT_MAX if there is no hazard. As a result, a negative
// WaitStatesNeeded here means no hazard, and we will continue to search
// for other categories.
WaitStatesNeeded =
- Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(),
- std::next(MI->getReverseIterator()), 0,
- IsExpiredFn, Visited, GetWaitStatesFn);
+ Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn);
}
}
- // WaitStatesNeeded now is the number of V_NOPs we need to insert, negative
- // means not needed.
- for (int i = 0; i < WaitStatesNeeded; i++)
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(AMDGPU::V_NOP_e32));
-
- return true;
+ return WaitStatesNeeded;
}
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
@@ -3406,7 +3419,8 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
};
const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
- AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0), 0),
+ AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0, ST),
+ 0),
0);
auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
switch (I.getOpcode()) {
@@ -3458,9 +3472,9 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
// Compute counter mask
unsigned DepCtr =
- IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0)
- : AMDGPU::DepCtr::encodeFieldVaSdst(0))
- : AMDGPU::DepCtr::encodeFieldSaSdst(0);
+ IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST)
+ : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST))
+ : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST);
// Try to merge previous waits into this one for regions with no SGPR reads.
if (!WaitInstrs.empty()) {
@@ -3725,7 +3739,7 @@ bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
.addImm(AMDGPU::DepCtr::encodeFieldVaSdst(
- AMDGPU::DepCtr::encodeFieldSaSdst(0), 0));
+ AMDGPU::DepCtr::encodeFieldSaSdst(0, ST), 0));
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 67beffa..d725134 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -32,6 +32,8 @@ class GCNSubtarget;
class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
public:
typedef function_ref<bool(const MachineInstr &)> IsHazardFn;
+ typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
+ typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;
private:
// Distinguish if we are called from scheduler or hazard recognizer
@@ -74,6 +76,8 @@ private:
// used on a newly inserted instruction before returning from PreEmitNoops.
void runOnInstruction(MachineInstr *MI);
+ int getWaitStatesSince(IsHazardFn IsHazard, int Limit,
+ GetNumWaitStatesFn GetNumWaitStates);
int getWaitStatesSince(IsHazardFn IsHazard, int Limit);
int getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit);
int getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit);
@@ -94,6 +98,9 @@ private:
int checkReadM0Hazards(MachineInstr *SMovRel);
int checkNSAtoVMEMHazard(MachineInstr *MI);
int checkFPAtomicToDenormModeHazard(MachineInstr *MI);
+ // Emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we
+ // need to insert; a negative value means none are needed.
+ bool emitVNops(MachineInstr *MI, int WaitStatesNeeded);
void fixHazards(MachineInstr *MI);
bool fixVcmpxPermlaneHazards(MachineInstr *MI);
bool fixVMEMtoScalarWriteHazards(MachineInstr *MI);
@@ -106,7 +113,7 @@ private:
bool fixVALUTransUseHazard(MachineInstr *MI);
bool fixVALUTransCoexecutionHazards(MachineInstr *MI);
bool fixWMMAHazards(MachineInstr *MI);
- bool fixWMMACoexecutionHazards(MachineInstr *MI);
+ int checkWMMACoexecutionHazards(MachineInstr *MI);
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
bool fixRequiredExportPriority(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
index 959ce69..1682abb 100644
--- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -43,7 +43,7 @@ public:
bool run(MachineFunction &MF);
private:
- using NSA_Status = enum {
+ enum NSA_Status {
NOT_NSA, // Not an NSA instruction
FIXED, // NSA which we cannot modify
NON_CONTIGUOUS, // NSA with non-sequential address which we can try
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4deb2a9..62172a0 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -136,7 +136,7 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
continue;
if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
- MachineOperand DefSrcMO = Def.getOperand(1);
+ const MachineOperand &DefSrcMO = Def.getOperand(1);
// Immediates are not an issue and can be propagated in
// postrapseudos pass. Only handle cases where defining
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 4e11c4f..2cb76a5 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -282,11 +282,12 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits,
Register Reg = MO.getReg();
auto I = llvm::find_if(VRegMaskOrUnits, [Reg](const VRegMaskOrUnit &RM) {
- return RM.RegUnit == Reg;
+ return RM.VRegOrUnit.asVirtualReg() == Reg;
});
auto &P = I == VRegMaskOrUnits.end()
- ? VRegMaskOrUnits.emplace_back(Reg, LaneBitmask::getNone())
+ ? VRegMaskOrUnits.emplace_back(VirtRegOrUnit(Reg),
+ LaneBitmask::getNone())
: *I;
P.LaneMask |= MO.getSubReg() ? TRI.getSubRegIndexLaneMask(MO.getSubReg())
@@ -295,7 +296,7 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits,
SlotIndex InstrSI;
for (auto &P : VRegMaskOrUnits) {
- auto &LI = LIS.getInterval(P.RegUnit);
+ auto &LI = LIS.getInterval(P.VRegOrUnit.asVirtualReg());
if (!LI.hasSubRanges())
continue;
@@ -312,29 +313,22 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits,
/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
static LaneBitmask getLanesWithProperty(
const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
- bool TrackLaneMasks, Register RegUnit, SlotIndex Pos,
- LaneBitmask SafeDefault,
+ bool TrackLaneMasks, Register Reg, SlotIndex Pos,
function_ref<bool(const LiveRange &LR, SlotIndex Pos)> Property) {
- if (RegUnit.isVirtual()) {
- const LiveInterval &LI = LIS.getInterval(RegUnit);
- LaneBitmask Result;
- if (TrackLaneMasks && LI.hasSubRanges()) {
- for (const LiveInterval::SubRange &SR : LI.subranges()) {
- if (Property(SR, Pos))
- Result |= SR.LaneMask;
- }
- } else if (Property(LI, Pos)) {
- Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit)
- : LaneBitmask::getAll();
+ assert(Reg.isVirtual());
+ const LiveInterval &LI = LIS.getInterval(Reg);
+ LaneBitmask Result;
+ if (TrackLaneMasks && LI.hasSubRanges()) {
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if (Property(SR, Pos))
+ Result |= SR.LaneMask;
}
-
- return Result;
+ } else if (Property(LI, Pos)) {
+ Result =
+ TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(Reg) : LaneBitmask::getAll();
}
- const LiveRange *LR = LIS.getCachedRegUnit(RegUnit);
- if (LR == nullptr)
- return SafeDefault;
- return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone();
+ return Result;
}
/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
@@ -502,10 +496,9 @@ void GCNRPTracker::reset(const MachineRegisterInfo &MRI_,
}
/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
-LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit,
- SlotIndex Pos) const {
+LaneBitmask GCNRPTracker::getLastUsedLanes(Register Reg, SlotIndex Pos) const {
return getLanesWithProperty(
- LIS, *MRI, true, RegUnit, Pos.getBaseIndex(), LaneBitmask::getNone(),
+ LIS, *MRI, true, Reg, Pos.getBaseIndex(),
[](const LiveRange &LR, SlotIndex Pos) {
const LiveRange::Segment *S = LR.getSegmentContaining(Pos);
return S != nullptr && S->end == Pos.getRegSlot();
@@ -562,10 +555,10 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
SmallVector<VRegMaskOrUnit, 8> RegUses;
collectVirtualRegUses(RegUses, MI, LIS, *MRI);
for (const VRegMaskOrUnit &U : RegUses) {
- LaneBitmask &LiveMask = LiveRegs[U.RegUnit];
+ LaneBitmask &LiveMask = LiveRegs[U.VRegOrUnit.asVirtualReg()];
LaneBitmask PrevMask = LiveMask;
LiveMask |= U.LaneMask;
- CurPressure.inc(U.RegUnit, PrevMask, LiveMask, *MRI);
+ CurPressure.inc(U.VRegOrUnit.asVirtualReg(), PrevMask, LiveMask, *MRI);
}
// Update MaxPressure with uses plus early-clobber defs pressure.
@@ -580,7 +573,7 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
const LiveRegSet *LiveRegsCopy) {
- MRI = &MI.getParent()->getParent()->getRegInfo();
+ MRI = &MI.getMF()->getRegInfo();
LastTrackedMI = nullptr;
MBBEnd = MI.getParent()->end();
NextMI = &MI;
@@ -748,9 +741,9 @@ GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
GCNRegPressure TempPressure = CurPressure;
for (const VRegMaskOrUnit &Use : RegOpers.Uses) {
- Register Reg = Use.RegUnit;
- if (!Reg.isVirtual())
+ if (!Use.VRegOrUnit.isVirtualReg())
continue;
+ Register Reg = Use.VRegOrUnit.asVirtualReg();
LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx);
if (LastUseMask.none())
continue;
@@ -782,9 +775,9 @@ GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
// Generate liveness for defs.
for (const VRegMaskOrUnit &Def : RegOpers.Defs) {
- Register Reg = Def.RegUnit;
- if (!Reg.isVirtual())
+ if (!Def.VRegOrUnit.isVirtualReg())
continue;
+ Register Reg = Def.VRegOrUnit.asVirtualReg();
auto It = LiveRegs.find(Reg);
LaneBitmask LiveMask = It != LiveRegs.end() ? It->second : LaneBitmask(0);
LaneBitmask NewMask = LiveMask | Def.LaneMask;
@@ -824,8 +817,7 @@ Printable llvm::print(const GCNRPTracker::LiveRegSet &LiveRegs,
Register Reg = Register::index2VirtReg(I);
auto It = LiveRegs.find(Reg);
if (It != LiveRegs.end() && It->second.any())
- OS << ' ' << printVRegOrUnit(Reg, TRI) << ':'
- << PrintLaneMask(It->second);
+ OS << ' ' << printReg(Reg, TRI) << ':' << PrintLaneMask(It->second);
}
OS << '\n';
});
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 4b22c68..f9d3ce0 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -292,7 +292,7 @@ protected:
/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
void bumpDeadDefs(ArrayRef<VRegMaskOrUnit> DeadDefs);
- LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const;
+ LaneBitmask getLastUsedLanes(Register Reg, SlotIndex Pos) const;
public:
// reset tracker and set live register set to the specified value.
@@ -455,7 +455,7 @@ template <typename Range>
DenseMap<MachineInstr*, GCNRPTracker::LiveRegSet>
getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) {
std::vector<SlotIndex> Indexes;
- Indexes.reserve(std::distance(R.begin(), R.end()));
+ Indexes.reserve(llvm::size(R));
auto &SII = *LIS.getSlotIndexes();
for (MachineInstr *I : R) {
auto SI = SII.getInstructionIndex(*I);
@@ -463,7 +463,7 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) {
}
llvm::sort(Indexes);
- auto &MRI = (*R.begin())->getParent()->getParent()->getRegInfo();
+ auto &MRI = (*R.begin())->getMF()->getRegInfo();
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap;
SmallVector<SlotIndex, 32> LiveIdxs, SRLiveIdxs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
@@ -493,13 +493,13 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) {
inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI,
const LiveIntervals &LIS) {
return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS,
- MI.getParent()->getParent()->getRegInfo());
+ MI.getMF()->getRegInfo());
}
inline GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI,
const LiveIntervals &LIS) {
return getLiveRegs(LIS.getInstructionIndex(MI).getBaseIndex(), LIS,
- MI.getParent()->getParent()->getRegInfo());
+ MI.getMF()->getRegInfo());
}
template <typename Range>
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 9fbf9e5..c8ce3aa 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -978,10 +978,8 @@ GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
MachineBasicBlock::iterator RegionEnd) {
- auto REnd = RegionEnd == RegionBegin->getParent()->end()
- ? std::prev(RegionEnd)
- : RegionEnd;
- return &*skipDebugInstructionsBackward(REnd, RegionBegin);
+ assert(RegionBegin != RegionEnd && "Region must not be empty");
+ return &*skipDebugInstructionsBackward(std::prev(RegionEnd), RegionBegin);
}
void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
@@ -1076,9 +1074,12 @@ GCNScheduleDAGMILive::getRegionLiveOutMap() const {
assert(!Regions.empty());
std::vector<MachineInstr *> RegionLastMIs;
RegionLastMIs.reserve(Regions.size());
- for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
+ for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) {
+ // Skip empty regions.
+ if (RegionBegin == RegionEnd)
+ continue;
RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
-
+ }
return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
}
@@ -1088,10 +1089,12 @@ void RegionPressureMap::buildLiveRegMap() {
RegionLiveRegMap =
IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
for (unsigned I = 0; I < DAG->Regions.size(); I++) {
+ auto &[RegionBegin, RegionEnd] = DAG->Regions[I];
+ // Skip empty regions.
+ if (RegionBegin == RegionEnd)
+ continue;
MachineInstr *RegionKey =
- IsLiveOut
- ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second)
- : &*DAG->Regions[I].first;
+ IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin;
IdxToInstruction[I] = RegionKey;
}
}
@@ -1228,18 +1231,20 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
InitialOccupancy = DAG.MinOccupancy;
- // Aggressivly try to reduce register pressure in the unclustered high RP
+ // Aggressively try to reduce register pressure in the unclustered high RP
// stage. Temporarily increase occupancy target in the region.
+ TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
+ ? InitialOccupancy + 1
+ : InitialOccupancy;
+ IsAnyRegionScheduled = false;
S.SGPRLimitBias = S.HighRPSGPRBias;
S.VGPRLimitBias = S.HighRPVGPRBias;
- if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
- MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
LLVM_DEBUG(
dbgs()
<< "Retrying function scheduling without clustering. "
- "Aggressivly try to reduce register pressure to achieve occupancy "
- << DAG.MinOccupancy << ".\n");
+ "Aggressively try to reduce register pressure to achieve occupancy "
+ << TempTargetOccupancy << ".\n");
return true;
}
@@ -1320,9 +1325,16 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
SavedMutations.swap(DAG.Mutations);
S.SGPRLimitBias = S.VGPRLimitBias = 0;
if (DAG.MinOccupancy > InitialOccupancy) {
+ assert(IsAnyRegionScheduled);
LLVM_DEBUG(dbgs() << StageID
<< " stage successfully increased occupancy to "
<< DAG.MinOccupancy << '\n');
+ } else if (!IsAnyRegionScheduled) {
+ assert(DAG.MinOccupancy == InitialOccupancy);
+ LLVM_DEBUG(dbgs() << StageID
+ << ": No regions scheduled, min occupancy stays at "
+ << DAG.MinOccupancy << ", MFI occupancy stays at "
+ << MFI.getOccupancy() << ".\n");
}
GCNSchedStage::finalizeGCNSchedStage();
@@ -1396,13 +1408,27 @@ bool UnclusteredHighRPStage::initGCNRegion() {
// rescheduling of previous regions did not make occupancy drop back down to
// the initial minimum).
unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
+ // If no region has been scheduled yet, the DAG has not yet been updated with
+ // the occupancy target. So retrieve it from the temporary.
+ unsigned CurrentTargetOccupancy =
+ IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
if (!DAG.RegionsWithExcessRP[RegionIdx] &&
- (DAG.MinOccupancy <= InitialOccupancy ||
+ (CurrentTargetOccupancy <= InitialOccupancy ||
DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
InitialOccupancy))
return false;
- return GCNSchedStage::initGCNRegion();
+ bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
+ // If this is the first region scheduled during this stage, apply the target
+ // occupancy change to the DAG and MFI.
+ if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
+ IsAnyRegionScheduled = true;
+ if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) {
+ DAG.MinOccupancy = TempTargetOccupancy;
+ MFI.increaseOccupancy(MF, TempTargetOccupancy);
+ }
+ }
+ return IsSchedulingThisRegion;
}
bool ClusteredLowOccStage::initGCNRegion() {
@@ -2011,7 +2037,7 @@ void PreRARematStage::rematerialize() {
// Rematerialize DefMI to its use block.
TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
- AMDGPU::NoSubRegister, *DefMI, *DAG.TRI);
+ AMDGPU::NoSubRegister, *DefMI);
Remat.RematMI = &*std::prev(InsertPos);
DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
@@ -2163,8 +2189,7 @@ void PreRARematStage::finalizeGCNSchedStage() {
// Re-rematerialize MI at the end of its original region. Note that it may
// not be rematerialized exactly in the same position as originally within
// the region, but it should not matter much.
- TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI,
- *DAG.TRI);
+ TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI);
MachineInstr *NewMI = &*std::prev(InsertPos);
DAG.LIS->InsertMachineInstrInMaps(*NewMI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 975781f..95a931b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -183,7 +183,7 @@ class ScheduleMetrics {
unsigned BubbleCycles;
public:
- ScheduleMetrics() {}
+ ScheduleMetrics() = default;
ScheduleMetrics(unsigned L, unsigned BC)
: ScheduleLength(L), BubbleCycles(BC) {}
unsigned getLength() const { return ScheduleLength; }
@@ -217,7 +217,7 @@ class RegionPressureMap {
bool IsLiveOut;
public:
- RegionPressureMap() {}
+ RegionPressureMap() = default;
RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut)
: DAG(GCNDAG), IsLiveOut(LiveOut) {}
// Build the Instr->LiveReg and RegionIdx->Instr maps
@@ -417,6 +417,10 @@ class UnclusteredHighRPStage : public GCNSchedStage {
private:
// Save the initial occupancy before starting this stage.
unsigned InitialOccupancy;
+ // Save the temporary target occupancy before starting this stage.
+ unsigned TempTargetOccupancy;
+ // Track whether any region was scheduled by this stage.
+ bool IsAnyRegionScheduled;
public:
bool initGCNSchedStage() override;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f377b8a..ddff3ad 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -166,6 +166,13 @@ protected:
bool HasMAIInsts = false;
bool HasFP8Insts = false;
bool HasFP8ConversionInsts = false;
+ bool HasCubeInsts = false;
+ bool HasLerpInst = false;
+ bool HasSadInsts = false;
+ bool HasQsadInsts = false;
+ bool HasCvtNormInsts = false;
+ bool HasCvtPkNormVOP2Insts = false;
+ bool HasCvtPkNormVOP3Insts = false;
bool HasFP8E5M3Insts = false;
bool HasCvtFP8Vop1Bug = false;
bool HasPkFmacF16Inst = false;
@@ -892,6 +899,20 @@ public:
bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
+ bool hasCubeInsts() const { return HasCubeInsts; }
+
+ bool hasLerpInst() const { return HasLerpInst; }
+
+ bool hasSadInsts() const { return HasSadInsts; }
+
+ bool hasQsadInsts() const { return HasQsadInsts; }
+
+ bool hasCvtNormInsts() const { return HasCvtNormInsts; }
+
+ bool hasCvtPkNormVOP2Insts() const { return HasCvtPkNormVOP2Insts; }
+
+ bool hasCvtPkNormVOP3Insts() const { return HasCvtPkNormVOP3Insts; }
+
bool hasFP8E5M3Insts() const { return HasFP8E5M3Insts; }
bool hasPkFmacF16Inst() const {
@@ -1420,6 +1441,13 @@ public:
/// \returns true if the target has instructions with xf32 format support.
bool hasXF32Insts() const { return HasXF32Insts; }
+ /// \returns true if the target has packed f32 instructions that read only 32
+ /// bits from a scalar operand (SGPR or literal) and replicate those bits to
+ /// both channels.
+ bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const {
+ return getGeneration() == GFX12 && GFX1250Insts;
+ }
+
bool hasBitOp3Insts() const { return HasBitOp3Insts; }
bool hasPermlane16Swap() const { return HasPermlane16Swap; }
@@ -1595,6 +1623,10 @@ public:
return hasKernargPreload() && !GFX1250Insts;
}
+ bool hasCondSubInsts() const { return GFX12Insts; }
+
+ bool hasSubClampInsts() const { return hasGFX10_3Insts(); }
+
/// \returns SGPR allocation granularity supported by the subtarget.
unsigned getSGPRAllocGranule() const {
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
@@ -1840,12 +1872,21 @@ public:
return GFX1250Insts && getGeneration() == GFX12;
}
+ // src_flat_scratch_hi cannot be used as a source in a SALU instruction that
+ // produces a 64-bit result.
+ bool hasFlatScratchHiInB64InstHazard() const {
+ return GFX1250Insts && getGeneration() == GFX12;
+ }
+
/// \returns true if the subtarget supports clusters of workgroups.
bool hasClusters() const { return HasClusters; }
- /// \returns true if the subtarget requires a wait for xcnt before atomic
- /// flat/global stores & rmw.
- bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
+ /// \returns true if the subtarget requires a wait for xcnt before VMEM
+ /// accesses that must never be repeated in the event of a page fault/retry.
+ /// Atomic stores/rmw and all volatile accesses fall under this criterion.
+ bool requiresWaitXCntForSingleAccessInstructions() const {
+ return GFX1250Insts;
+ }
/// \returns the number of significant bits in the immediate field of the
/// S_NOP instruction.
diff --git a/llvm/lib/Target/AMDGPU/InstCombineTables.td b/llvm/lib/Target/AMDGPU/InstCombineTables.td
deleted file mode 100644
index 3d62641..0000000
--- a/llvm/lib/Target/AMDGPU/InstCombineTables.td
+++ /dev/null
@@ -1,10 +0,0 @@
-include "AMDGPU.td"
-
-def AMDGPUImageDMaskIntrinsicTable : GenericTable {
- let FilterClass = "AMDGPUImageDMaskIntrinsic";
- let Fields = ["Intr"];
-
- let PrimaryKey = ["Intr"];
- let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic";
- let PrimaryKeyEarlyOut = 1;
-}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 703ec0a..b63d71d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -12,6 +12,7 @@
#include "SIDefines.h"
#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -336,7 +337,7 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI,
// \returns a low 256 vgpr representing a high vgpr \p Reg [v256..v1023] or
// \p Reg itself otherwise.
-static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) {
+static MCRegister getRegForPrinting(MCRegister Reg, const MCRegisterInfo &MRI) {
unsigned Enc = MRI.getEncodingValue(Reg);
unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
if (Idx < 0x100)
@@ -355,10 +356,10 @@ static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) {
}
// Restore MSBs of a VGPR above 255 from the MCInstrAnalysis.
-static MCPhysReg getRegFromMIA(MCPhysReg Reg, unsigned OpNo,
- const MCInstrDesc &Desc,
- const MCRegisterInfo &MRI,
- const AMDGPUMCInstrAnalysis &MIA) {
+static MCRegister getRegFromMIA(MCRegister Reg, unsigned OpNo,
+ const MCInstrDesc &Desc,
+ const MCRegisterInfo &MRI,
+ const AMDGPUMCInstrAnalysis &MIA) {
unsigned VgprMSBs = MIA.getVgprMSBs();
if (!VgprMSBs)
return Reg;
@@ -403,10 +404,10 @@ void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O,
}
#endif
- unsigned PrintReg = getRegForPrinting(Reg, MRI);
+ MCRegister PrintReg = getRegForPrinting(Reg, MRI);
O << getRegisterName(PrintReg);
- if (PrintReg != Reg.id())
+ if (PrintReg != Reg)
O << " /*" << getRegisterName(Reg) << "*/";
}
@@ -490,6 +491,18 @@ void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo,
printRegularOperand(MI, OpNo, STI, O);
}
+void AMDGPUInstPrinter::printAVLdSt32Align2RegOp(const MCInst *MI,
+ unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ MCRegister Reg = MI->getOperand(OpNo).getReg();
+
+ // On targets with an even alignment requirement the operand is represented
+ // by an even-aligned register pair; print only its first 32-bit sub-register.
+ if (MCRegister SubReg = MRI.getSubReg(Reg, AMDGPU::sub0))
+ Reg = SubReg;
+ printRegOperand(Reg, O, MRI);
+}
+
void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -795,14 +808,24 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
// Intention: print disassembler message when invalid code is decoded,
// for example sgpr register used in VReg or VISrc(VReg or imm) operand.
const MCOperandInfo &OpInfo = Desc.operands()[OpNo];
- int16_t RCID = MII.getOpRegClassID(
- OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo));
- if (RCID != -1) {
+ if (OpInfo.RegClass != -1) {
+ int16_t RCID = MII.getOpRegClassID(
+ OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo));
const MCRegisterClass &RC = MRI.getRegClass(RCID);
auto Reg = mc2PseudoReg(Op.getReg());
if (!RC.contains(Reg) && !isInlineValue(Reg)) {
- O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC)
- << "\' register class*/";
+ bool IsWaveSizeOp = OpInfo.isLookupRegClassByHwMode() &&
+ (OpInfo.RegClass == AMDGPU::SReg_1 ||
+ OpInfo.RegClass == AMDGPU::SReg_1_XEXEC);
+ // Suppress this comment for a mismatched wavesize. Some users expect to
+ // be able to assemble and disassemble modules with mixed wavesizes, but
+ // we do not know the subtarget in different functions in MC.
+ //
+ // TODO: Should probably print it anyway, maybe a more specific version.
+ if (!IsWaveSizeOp) {
+ O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC)
+ << "\' register class*/";
+ }
}
}
} else if (Op.isImm()) {
@@ -1331,12 +1354,9 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI,
return;
O << Name;
- for (int I = 0; I < NumOps; ++I) {
- if (I != 0)
- O << ',';
-
- O << !!(Ops[I] & Mod);
- }
+ ListSeparator Sep(",");
+ for (int I = 0; I < NumOps; ++I)
+ O << Sep << !!(Ops[I] & Mod);
if (HasDstSel) {
O << ',' << !!(Ops[0] & SISrcMods::DST_OP_SEL);
@@ -1574,14 +1594,10 @@ void AMDGPUInstPrinter::printGPRIdxMode(const MCInst *MI, unsigned OpNo,
O << formatHex(static_cast<uint64_t>(Val));
} else {
O << "gpr_idx(";
- bool NeedComma = false;
+ ListSeparator Sep(",");
for (unsigned ModeId = ID_MIN; ModeId <= ID_MAX; ++ModeId) {
- if (Val & (1 << ModeId)) {
- if (NeedComma)
- O << ',';
- O << IdSymbolic[ModeId];
- NeedComma = true;
- }
+ if (Val & (1 << ModeId))
+ O << Sep << IdSymbolic[ModeId];
}
O << ')';
}
@@ -1788,25 +1804,16 @@ void AMDGPUInstPrinter::printSWaitCnt(const MCInst *MI, unsigned OpNo,
bool IsDefaultLgkmcnt = Lgkmcnt == getLgkmcntBitMask(ISA);
bool PrintAll = IsDefaultVmcnt && IsDefaultExpcnt && IsDefaultLgkmcnt;
- bool NeedSpace = false;
+ ListSeparator Sep(" ");
- if (!IsDefaultVmcnt || PrintAll) {
- O << "vmcnt(" << Vmcnt << ')';
- NeedSpace = true;
- }
+ if (!IsDefaultVmcnt || PrintAll)
+ O << Sep << "vmcnt(" << Vmcnt << ')';
- if (!IsDefaultExpcnt || PrintAll) {
- if (NeedSpace)
- O << ' ';
- O << "expcnt(" << Expcnt << ')';
- NeedSpace = true;
- }
+ if (!IsDefaultExpcnt || PrintAll)
+ O << Sep << "expcnt(" << Expcnt << ')';
- if (!IsDefaultLgkmcnt || PrintAll) {
- if (NeedSpace)
- O << ' ';
- O << "lgkmcnt(" << Lgkmcnt << ')';
- }
+ if (!IsDefaultLgkmcnt || PrintAll)
+ O << Sep << "lgkmcnt(" << Lgkmcnt << ')';
}
void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo,
@@ -1822,14 +1829,10 @@ void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo,
StringRef Name;
unsigned Val;
bool IsDefault;
- bool NeedSpace = false;
+ ListSeparator Sep(" ");
while (decodeDepCtr(Imm16, Id, Name, Val, IsDefault, STI)) {
- if (!IsDefault || !HasNonDefaultVal) {
- if (NeedSpace)
- O << ' ';
- O << Name << '(' << Val << ')';
- NeedSpace = true;
- }
+ if (!IsDefault || !HasNonDefaultVal)
+ O << Sep << Name << '(' << Val << ')';
}
} else {
O << formatHex(Imm16);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index b27295e..564d6ee 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -77,6 +77,9 @@ private:
raw_ostream &O);
void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printAVLdSt32Align2RegOp(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
void printImmediateInt16(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediateBF16(uint32_t Imm, const MCSubtargetInfo &STI,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
index c27be02..093c85e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -7,9 +7,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMCExpr.h"
-#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -317,30 +315,6 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR,
return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx);
}
-/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
-///
-/// Remove dependency on GCNSubtarget and depend only only the necessary values
-/// for said occupancy computation. Should match computeOccupancy implementation
-/// without passing \p STM on.
-const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(
- unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs,
- unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx) {
- unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
- unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
- unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
- unsigned Generation = STM.getGeneration();
-
- auto CreateExpr = [&Ctx](unsigned Value) {
- return MCConstantExpr::create(Value, Ctx);
- };
-
- return create(AGVK_Occupancy,
- {CreateExpr(MaxWaves), CreateExpr(Granule),
- CreateExpr(TargetTotalNumVGPRs), CreateExpr(Generation),
- CreateExpr(InitOcc), NumSGPRs, NumVGPRs},
- Ctx);
-}
-
const AMDGPUMCExpr *AMDGPUMCExpr::createLit(LitModifier Lit, int64_t Value,
MCContext &Ctx) {
assert(Lit == LitModifier::Lit || Lit == LitModifier::Lit64);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index 246a3f8..bf7b40b 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -98,11 +98,6 @@ public:
return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx);
}
- static const AMDGPUMCExpr *
- createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
- const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize,
- const GCNSubtarget &STM, MCContext &Ctx);
-
static const AMDGPUMCExpr *createLit(LitModifier Lit, int64_t Value,
MCContext &Ctx);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 013cfeb..28b4da8 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -168,7 +168,7 @@ bool AMDGPUMCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
void AMDGPUMCInstrAnalysis::updateState(const MCInst &Inst, uint64_t Addr) {
if (Inst.getOpcode() == AMDGPU::S_SET_VGPR_MSB_gfx12)
- VgprMSBs = Inst.getOperand(0).getImm();
+ VgprMSBs = Inst.getOperand(0).getImm() & 0xff;
else if (isTerminator(Inst))
VgprMSBs = 0;
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 5a08573..0855d6d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -302,9 +302,9 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceInfo(
#undef PRINT_RES_INFO
}
-void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
- const MCSymbol *MaxAGPR,
- const MCSymbol *MaxSGPR) {
+void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(
+ const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR,
+ const MCSymbol *MaxNamedBarrier) {
#define PRINT_RES_INFO(ARG) \
OS << "\t.set "; \
ARG->print(OS, getContext().getAsmInfo()); \
@@ -315,6 +315,7 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
PRINT_RES_INFO(MaxVGPR);
PRINT_RES_INFO(MaxAGPR);
PRINT_RES_INFO(MaxSGPR);
+ PRINT_RES_INFO(MaxNamedBarrier);
#undef PRINT_RES_INFO
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 22afcde..3a0d8dc 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -69,7 +69,8 @@ public:
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
const MCSymbol *MaxAGPR,
- const MCSymbol *MaxSGPR) {};
+ const MCSymbol *MaxSGPR,
+ const MCSymbol *MaxNamedBarrier) {};
/// \returns True on success, false on failure.
virtual bool EmitISAVersion() { return true; }
@@ -149,7 +150,8 @@ public:
const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) override;
void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR,
- const MCSymbol *MaxSGPR) override;
+ const MCSymbol *MaxSGPR,
+ const MCSymbol *MaxNamedBarrier) override;
/// \returns True on success, false on failure.
bool EmitISAVersion() override;
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index d950131..65dce74 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -2116,8 +2116,10 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p
let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
+ // Set VADDR4 to NULL
+ let vaddr4 = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
+
// set to 0 based on SPG.
- let vaddr4 = 0;
let rsrc = 0;
let vdata = 0;
let d16 = 0;
diff --git a/llvm/lib/Target/AMDGPU/R600.td b/llvm/lib/Target/AMDGPU/R600.td
index 9148edb..bdfaac9 100644
--- a/llvm/lib/Target/AMDGPU/R600.td
+++ b/llvm/lib/Target/AMDGPU/R600.td
@@ -8,15 +8,6 @@
include "llvm/Target/Target.td"
-def R600InstrInfo : InstrInfo {
- let guessInstructionProperties = 1;
-}
-
-def R600 : Target {
- let InstructionSet = R600InstrInfo;
- let AllowRegisterRenaming = 1;
-}
-
let Namespace = "R600" in {
foreach Index = 0-15 in {
@@ -27,6 +18,18 @@ include "R600RegisterInfo.td"
}
+defm : RemapAllTargetPseudoPointerOperands<R600_Addr>;
+
+def R600InstrInfo : InstrInfo {
+ let guessInstructionProperties = 1;
+}
+
+def R600 : Target {
+ let InstructionSet = R600InstrInfo;
+ let AllowRegisterRenaming = 1;
+}
+
+
def NullALU : InstrItinClass;
def ALU_NULL : FuncUnit;
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 2aa54c9..3c4f115 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -13,6 +13,7 @@
#include "R600ISelLowering.h"
#include "AMDGPU.h"
+#include "AMDGPUSelectionDAGInfo.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
@@ -29,7 +30,8 @@ using namespace llvm;
R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
const R600Subtarget &STI)
- : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) {
+ : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI),
+ Gen(STI.getGeneration()) {
addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass);
addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass);
addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass);
@@ -1129,12 +1131,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) {
// Add an extra level of chain to isolate this vector
SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
- // TODO: can the chain be replaced without creating a new store?
- SDValue NewStore = DAG.getTruncStore(
- NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT,
- StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(),
- StoreNode->getAAInfo());
- StoreNode = cast<StoreSDNode>(NewStore);
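+ // Splice in the new chain by updating the store's operands in place
+ // instead of rebuilding the truncating store.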
+ SmallVector<SDValue, 4> NewOps(StoreNode->ops());
+ NewOps[0] = NewChain;
+ StoreNode = cast<StoreSDNode>(DAG.UpdateNodeOperands(StoreNode, NewOps));
}
return scalarizeVectorStore(StoreNode, DAG);
@@ -2186,6 +2185,8 @@ R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::FSub:
case AtomicRMWInst::FMax:
case AtomicRMWInst::FMin:
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat:
return AtomicExpansionKind::CmpXChg;
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
index 3e256cc..7f805e6 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -29,7 +29,7 @@ using namespace llvm;
#include "R600GenInstrInfo.inc"
R600InstrInfo::R600InstrInfo(const R600Subtarget &ST)
- : R600GenInstrInfo(ST, -1, -1), RI(), ST(ST) {}
+ : R600GenInstrInfo(ST, RI, -1, -1), RI(), ST(ST) {}
bool R600InstrInfo::isVector(const MachineInstr &MI) const {
return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR;
@@ -176,7 +176,7 @@ bool R600InstrInfo::usesVertexCache(unsigned Opcode) const {
}
bool R600InstrInfo::usesVertexCache(const MachineInstr &MI) const {
- const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineFunction *MF = MI.getMF();
return !AMDGPU::isCompute(MF->getFunction().getCallingConv()) &&
usesVertexCache(MI.getOpcode());
}
@@ -186,7 +186,7 @@ bool R600InstrInfo::usesTextureCache(unsigned Opcode) const {
}
bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const {
- const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineFunction *MF = MI.getMF();
return (AMDGPU::isCompute(MF->getFunction().getCallingConv()) &&
usesVertexCache(MI.getOpcode())) ||
usesTextureCache(MI.getOpcode());
@@ -948,7 +948,7 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI,
.setReg(Pred[2].getReg());
MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_W))
.setReg(Pred[2].getReg());
- MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+ MachineInstrBuilder MIB(*MI.getMF(), MI);
MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit);
return true;
}
@@ -956,7 +956,7 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI,
if (PIdx != -1) {
MachineOperand &PMO = MI.getOperand(PIdx);
PMO.setReg(Pred[2].getReg());
- MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+ MachineInstrBuilder MIB(*MI.getMF(), MI);
MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp
index 48b4e7f..ac6508c 100644
--- a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp
@@ -55,7 +55,7 @@ void R600AsmPrinter::emitInstruction(const MachineInstr *MI) {
StringRef Err;
if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
- LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext();
+ LLVMContext &C = MI->getMF()->getFunction().getContext();
C.emitError("Illegal instruction detected: " + Err);
MI->print(errs());
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index b7a92a0..0d206ab 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -523,6 +523,7 @@ enum Id { // HwRegCode, (6) [5:0]
ID_HW_ID1 = 23,
ID_HW_ID2 = 24,
ID_POPS_PACKER = 25,
+ ID_SCHED_MODE = 26,
ID_PERF_SNAPSHOT_DATA_gfx11 = 27,
ID_IB_STS2 = 28,
ID_SHADER_CYCLES = 29,
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 7793907..39a6a77 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -238,7 +238,7 @@ static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
const SIRegisterInfo *TRI,
const SIInstrInfo *TII) {
- MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
auto &Src = MI.getOperand(1);
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = Src.getReg();
@@ -856,8 +856,8 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
}
}
- if (TRI->isVectorRegister(*MRI, PHIRes) ||
- RC0 == &AMDGPU::VReg_1RegClass) {
+ if (TRI->hasVectorRegisters(MRI->getRegClass(PHIRes)) ||
+ RC0 == &AMDGPU::VReg_1RegClass) {
LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
TII->legalizeOperands(MI, MDT);
}
@@ -902,14 +902,28 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
// really much we can do to fix this.
// Some special instructions use M0 as an input. Some even only use
// the first lane. Insert a readfirstlane and hope for the best.
- if (DstReg == AMDGPU::M0 &&
- TRI->hasVectorRegisters(MRI->getRegClass(SrcReg))) {
+ const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
+ if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
Register TmpReg =
MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
- TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
+
+ const MCInstrDesc &ReadFirstLaneDesc =
+ TII->get(AMDGPU::V_READFIRSTLANE_B32);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), ReadFirstLaneDesc, TmpReg)
.add(MI.getOperand(1));
+
+ unsigned SubReg = MI.getOperand(1).getSubReg();
MI.getOperand(1).setReg(TmpReg);
+ MI.getOperand(1).setSubReg(AMDGPU::NoSubRegister);
+
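+ // Constrain the original source register so that the (possibly
+ // sub-registered) value read by V_READFIRSTLANE_B32 satisfies the
+ // operand's required register class.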
+ const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1);
+ const TargetRegisterClass *ConstrainRC =
+ SubReg == AMDGPU::NoSubRegister
+ ? OpRC
+ : TRI->getMatchingSuperRegClass(SrcRC, OpRC, SubReg);
+
+ if (!MRI->constrainRegClass(SrcReg, ConstrainRC))
+ llvm_unreachable("failed to constrain register");
} else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(),
MI, MI.getDebugLoc())) {
I = std::next(I);
@@ -930,7 +944,7 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
// s_mov_b32.
if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) {
MI.getOperand(1).ChangeToImmediate(Imm);
- MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
+ MI.addImplicitDefUseOperands(*MI.getMF());
MI.setDesc(TII->get(SMovOp));
return true;
}
@@ -1122,9 +1136,20 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
.addReg(VReg32);
} else if (SrcSize == 32) {
- auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
- MIB.addReg(SrcReg, 0, SubReg);
+ const MCInstrDesc &ReadFirstLaneDesc =
+ TII->get(AMDGPU::V_READFIRSTLANE_B32);
+ const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), ReadFirstLaneDesc, DstReg)
+ .addReg(SrcReg, 0, SubReg);
+
+ const TargetRegisterClass *ConstrainRC =
+ SubReg == AMDGPU::NoSubRegister
+ ? OpRC
+ : TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), OpRC,
+ SubReg);
+
+ if (!MRI->constrainRegClass(SrcReg, ConstrainRC))
+ llvm_unreachable("failed to constrain register");
} else {
auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::REG_SEQUENCE), DstReg);
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 6616b30..2df9267 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -681,6 +681,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
return false;
MI->setDesc(TII->get(NewMFMAOpc));
MI->untieRegOperand(0);
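+ // The new MFMA opcode may mark its defs as early-clobber; re-apply that
+ // flag to the def operands after the in-place descriptor swap.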
+ const MCInstrDesc &MCID = MI->getDesc();
+ for (unsigned I = 0; I < MI->getNumDefs(); ++I)
+ if (MCID.getOperandConstraint(I, MCOI::EARLY_CLOBBER) != -1)
+ MI->getOperand(I).setIsEarlyClobber(true);
}
// TODO: Should we try to avoid adding this to the candidate list?
@@ -709,7 +713,7 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
// Verify the register is compatible with the operand.
if (const TargetRegisterClass *OpRC =
- TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) {
+ TII->getRegClass(MI->getDesc(), Fold.UseOpNo)) {
const TargetRegisterClass *NewRC =
TRI->getRegClassForReg(*MRI, New->getReg());
@@ -762,6 +766,29 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
}
+ // Returns true if the instruction is a packed F32 instruction that reads only
+ // 32 bits of the given scalar operand and replicates those bits to both
+ // channels.
+static bool isPKF32InstrReplicatesLower32BitsOfScalarOperand(
+ const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo) {
+ if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput())
+ return false;
+ const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo];
+ return OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
+}
+
+ // Packed FP32 instructions read only 32 bits from a scalar operand (SGPR or
+ // literal) and replicate those bits to both channels. Therefore, if the hi
+ // and lo halves of the immediate differ, we cannot fold it.
+static bool checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(
+ const FoldableDef &OpToFold) {
+ assert(OpToFold.isImm() && "Expected immediate operand");
+ uint64_t ImmVal = OpToFold.getEffectiveImmVal().value();
+ uint32_t Lo = Lo_32(ImmVal);
+ uint32_t Hi = Hi_32(ImmVal);
+ return Lo == Hi;
+}
+
bool SIFoldOperandsImpl::tryAddToFoldList(
SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
const FoldableDef &OpToFold) const {
@@ -915,6 +942,13 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
return true;
}
+ // Special case for PK_F32 instructions if we are trying to fold an imm to
+ // src0 or src1.
+ if (OpToFold.isImm() &&
+ isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, MI, OpNo) &&
+ !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold))
+ return false;
+
appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
return true;
}
@@ -1129,40 +1163,14 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
return false;
- MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
+ if (isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, UseMI, UseOpIdx) &&
+ !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold))
+ return false;
appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
return true;
}
- // TODO: Verify the following code handles subregisters correctly.
- // TODO: Handle extract of global reference
- if (UseOp.getSubReg())
- return false;
-
- if (!OpToFold.isReg())
- return false;
-
- Register UseReg = OpToFold.getReg();
- if (!UseReg.isVirtual())
- return false;
-
- // Maybe it is just a COPY of an immediate itself.
-
- // FIXME: Remove this handling. There is already special case folding of
- // immediate into copy in foldOperand. This is looking for the def of the
- // value the folding started from in the first place.
- MachineInstr *Def = MRI->getVRegDef(UseReg);
- if (Def && TII->isFoldableCopy(*Def)) {
- MachineOperand &DefOp = Def->getOperand(1);
- if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
- FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC,
- OpToFold.DefSubReg);
- appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm);
- return true;
- }
- }
-
return false;
}
@@ -1309,10 +1317,11 @@ void SIFoldOperandsImpl::foldOperand(
continue;
const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
- const TargetRegisterClass *MovSrcRC =
- TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[SrcIdx]));
- if (MovSrcRC) {
+ int16_t RegClassID = TII->getOpRegClassID(MovDesc.operands()[SrcIdx]);
+ if (RegClassID != -1) {
+ const TargetRegisterClass *MovSrcRC = TRI->getRegClass(RegClassID);
+
if (UseSubReg)
MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
@@ -1351,7 +1360,7 @@ void SIFoldOperandsImpl::foldOperand(
if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
const auto &SrcOp = UseMI->getOperand(UseOpIdx);
MachineOperand NewSrcOp(SrcOp);
- MachineFunction *MF = UseMI->getParent()->getParent();
+ MachineFunction *MF = UseMI->getMF();
UseMI->removeOperand(1);
UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
UseMI->addOperand(NewSrcOp); // src0
@@ -1382,7 +1391,7 @@ void SIFoldOperandsImpl::foldOperand(
// Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
// VS_16RegClass
//
- // Excerpt from AMDGPUGenRegisterInfo.inc
+ // Excerpt from AMDGPUGenRegisterInfoEnums.inc
// NoSubRegister, //0
// hi16, // 1
// lo16, // 2
@@ -1558,20 +1567,6 @@ static unsigned getMovOpc(bool IsScalar) {
return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}
-static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
- MI.setDesc(NewDesc);
-
- // Remove any leftover implicit operands from mutating the instruction. e.g.
- // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
- // anymore.
- const MCInstrDesc &Desc = MI.getDesc();
- unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
- Desc.implicit_defs().size();
-
- for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
- MI.removeOperand(I);
-}
-
std::optional<int64_t>
SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const {
if (Op.isImm())
@@ -1610,7 +1605,8 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
Opc == AMDGPU::S_NOT_B32) &&
Src0Imm) {
MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
- mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
+ TII->mutateAndCleanupImplicit(
+ *MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
return true;
}
@@ -1638,7 +1634,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
// instruction.
MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
MI->removeOperand(Src1Idx);
- mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
+ TII->mutateAndCleanupImplicit(*MI, TII->get(getMovOpc(IsSGPR)));
return true;
}
@@ -1658,11 +1654,12 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
if (Src1Val == 0) {
// y = or x, 0 => y = copy x
MI->removeOperand(Src1Idx);
- mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+ TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
} else if (Src1Val == -1) {
// y = or x, -1 => y = v_mov_b32 -1
MI->removeOperand(Src1Idx);
- mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
+ TII->mutateAndCleanupImplicit(
+ *MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
} else
return false;
@@ -1674,11 +1671,12 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
if (Src1Val == 0) {
// y = and x, 0 => y = v_mov_b32 0
MI->removeOperand(Src0Idx);
- mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
+ TII->mutateAndCleanupImplicit(
+ *MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
} else if (Src1Val == -1) {
// y = and x, -1 => y = copy x
MI->removeOperand(Src1Idx);
- mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+ TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
} else
return false;
@@ -1690,7 +1688,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
if (Src1Val == 0) {
// y = xor x, 0 => y = copy x
MI->removeOperand(Src1Idx);
- mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+ TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
return true;
}
}
@@ -1736,7 +1734,7 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
MI.removeOperand(Src1ModIdx);
if (Src0ModIdx != -1)
MI.removeOperand(Src0ModIdx);
- mutateCopyOp(MI, NewDesc);
+ TII->mutateAndCleanupImplicit(MI, NewDesc);
LLVM_DEBUG(dbgs() << MI);
return true;
}
@@ -1804,7 +1802,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
if (CopiesToReplace.empty() && FoldList.empty())
return Changed;
- MachineFunction *MF = MI.getParent()->getParent();
+ MachineFunction *MF = MI.getMF();
// Make sure we add EXEC uses to any new v_mov instructions created.
for (MachineInstr *Copy : CopiesToReplace)
Copy->addImplicitDefUseOperands(*MF);
@@ -2419,7 +2417,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
unsigned OpIdx = Op - &UseMI->getOperand(0);
const MCInstrDesc &InstDesc = UseMI->getDesc();
- const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI);
+ const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx);
if (!OpRC || !TRI->isVectorSuperClass(OpRC))
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 5c39f7a..ec3e720 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -772,6 +772,17 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
PreloadedScratchRsrcReg,
ScratchRsrcReg, ScratchWaveOffsetReg);
}
+
+ if (ST.hasWaitXCnt()) {
+ // Set REPLAY_MODE (bit 25) in MODE register to enable multi-group XNACK
+ // replay. This aligns hardware behavior with the compiler's s_wait_xcnt
+ // insertion logic, which assumes multi-group mode by default.
+ unsigned RegEncoding =
+ AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 25, 1);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+ .addImm(1)
+ .addImm(RegEncoding);
+ }
}
// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
@@ -1833,9 +1844,7 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
const GCNSubtarget &ST,
- std::vector<CalleeSavedInfo> &CSI,
- unsigned &MinCSFrameIndex,
- unsigned &MaxCSFrameIndex) {
+ std::vector<CalleeSavedInfo> &CSI) {
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &MFI = MF.getFrameInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -1904,10 +1913,7 @@ static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
int FrameIdx =
MFI.CreateStackObject(BlockSize, TRI->getSpillAlign(*BlockRegClass),
/*isSpillSlot=*/true);
- if ((unsigned)FrameIdx < MinCSFrameIndex)
- MinCSFrameIndex = FrameIdx;
- if ((unsigned)FrameIdx > MaxCSFrameIndex)
- MaxCSFrameIndex = FrameIdx;
+ MFI.setIsCalleeSavedObjectIndex(FrameIdx, true);
CSIt->setFrameIdx(FrameIdx);
CSIt->setReg(RegBlock);
@@ -1917,8 +1923,7 @@ static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
bool SIFrameLowering::assignCalleeSavedSpillSlots(
MachineFunction &MF, const TargetRegisterInfo *TRI,
- std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
- unsigned &MaxCSFrameIndex) const {
+ std::vector<CalleeSavedInfo> &CSI) const {
if (CSI.empty())
return true; // Early exit if no callee saved registers are modified!
@@ -1926,12 +1931,12 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots(
bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR();
if (UseVGPRBlocks)
- assignSlotsUsingVGPRBlocks(MF, ST, CSI, MinCSFrameIndex, MaxCSFrameIndex);
+ assignSlotsUsingVGPRBlocks(MF, ST, CSI);
- return assignCalleeSavedSpillSlots(MF, TRI, CSI) || UseVGPRBlocks;
+ return assignCalleeSavedSpillSlotsImpl(MF, TRI, CSI) || UseVGPRBlocks;
}
-bool SIFrameLowering::assignCalleeSavedSpillSlots(
+bool SIFrameLowering::assignCalleeSavedSpillSlotsImpl(
MachineFunction &MF, const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI) const {
if (CSI.empty())
@@ -2170,7 +2175,9 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
return MFI.getStackSize() != 0;
}
- return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
+ return (frameTriviallyRequiresSP(MFI) &&
+ !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) ||
+ MFI.isFrameAddressTaken() ||
MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
MF) ||
mayReserveScratchForCWSR(MF) ||
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index a727729..4c1cf3c 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -49,11 +49,9 @@ public:
const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI) const override;
- bool assignCalleeSavedSpillSlots(MachineFunction &MF,
- const TargetRegisterInfo *TRI,
- std::vector<CalleeSavedInfo> &CSI,
- unsigned &MinCSFrameIndex,
- unsigned &MaxCSFrameIndex) const override;
+ bool assignCalleeSavedSpillSlotsImpl(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b34ab2a..0f91b31 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPULaneMaskUtils.h"
+#include "AMDGPUSelectionDAGInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -34,6 +35,8 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePassManager.h"
+#include "llvm/CodeGen/PseudoSourceValueManager.h"
#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
@@ -86,69 +89,78 @@ static unsigned findFirstFreeSGPR(CCState &CCInfo) {
SITargetLowering::SITargetLowering(const TargetMachine &TM,
const GCNSubtarget &STI)
- : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
+ : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
- addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
+
+ const SIRegisterInfo *TRI = STI.getRegisterInfo();
+ const TargetRegisterClass *V32RegClass =
+ TRI->getDefaultVectorSuperClassForBitWidth(32);
+ addRegisterClass(MVT::f32, V32RegClass);
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
- const SIRegisterInfo *TRI = STI.getRegisterInfo();
- const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
+ const TargetRegisterClass *V64RegClass =
+ TRI->getDefaultVectorSuperClassForBitWidth(64);
addRegisterClass(MVT::f64, V64RegClass);
addRegisterClass(MVT::v2f32, V64RegClass);
addRegisterClass(MVT::Untyped, V64RegClass);
addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
- addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
+ addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
- addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
+ addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
- addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
+ addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
- addRegisterClass(MVT::v6f32, &AMDGPU::VReg_192RegClass);
+ addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
- addRegisterClass(MVT::v3f64, &AMDGPU::VReg_192RegClass);
+ addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
- addRegisterClass(MVT::v7f32, &AMDGPU::VReg_224RegClass);
+ addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
+ addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass);
+ addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
- addRegisterClass(MVT::v9f32, &AMDGPU::VReg_288RegClass);
+ addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
- addRegisterClass(MVT::v10f32, &AMDGPU::VReg_320RegClass);
+ addRegisterClass(MVT::v10f32,
+ TRI->getDefaultVectorSuperClassForBitWidth(320));
addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
- addRegisterClass(MVT::v11f32, &AMDGPU::VReg_352RegClass);
+ addRegisterClass(MVT::v11f32,
+ TRI->getDefaultVectorSuperClassForBitWidth(352));
addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
- addRegisterClass(MVT::v12f32, &AMDGPU::VReg_384RegClass);
+ addRegisterClass(MVT::v12f32,
+ TRI->getDefaultVectorSuperClassForBitWidth(384));
addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v16f32,
+ TRI->getDefaultVectorSuperClassForBitWidth(512));
addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
- addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v16f64,
+ TRI->getDefaultVectorSuperClassForBitWidth(1024));
if (Subtarget->has16BitInsts()) {
if (Subtarget->useRealTrue16Insts()) {
@@ -180,7 +192,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
- addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v32f32,
+ TRI->getDefaultVectorSuperClassForBitWidth(1024));
computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -1047,6 +1060,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::ATOMIC_LOAD_FMAX,
ISD::ATOMIC_LOAD_UINC_WRAP,
ISD::ATOMIC_LOAD_UDEC_WRAP,
+ ISD::ATOMIC_LOAD_USUB_COND,
+ ISD::ATOMIC_LOAD_USUB_SAT,
ISD::INTRINSIC_VOID,
ISD::INTRINSIC_W_CHAIN});
@@ -1297,7 +1312,7 @@ static unsigned getIntrMemWidth(unsigned IntrID) {
}
}
-static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
+static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad,
TargetLoweringBase::IntrinsicInfo &Info) {
Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
@@ -1327,7 +1342,7 @@ static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
}
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
- const CallInst &CI,
+ const CallBase &CI,
MachineFunction &MF,
unsigned IntrID) const {
Info.flags = MachineMemOperand::MONone;
@@ -1507,15 +1522,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
- case Intrinsic::amdgcn_global_atomic_csub: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType());
- Info.ptrVal = CI.getOperand(0);
- Info.align.reset();
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
- MachineMemOperand::MOVolatile;
- return true;
- }
case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
case Intrinsic::amdgcn_image_bvh_intersect_ray:
case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
@@ -1536,8 +1542,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
- case Intrinsic::amdgcn_flat_atomic_fmax_num:
- case Intrinsic::amdgcn_atomic_cond_sub_u32: {
+ case Intrinsic::amdgcn_flat_atomic_fmax_num: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
@@ -1712,7 +1717,6 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
Type *&AccessTy) const {
Value *Ptr = nullptr;
switch (II->getIntrinsicID()) {
- case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_cluster_load_b128:
case Intrinsic::amdgcn_cluster_load_b64:
case Intrinsic::amdgcn_cluster_load_b32:
@@ -1735,7 +1739,6 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_flat_load_monitor_b128:
case Intrinsic::amdgcn_flat_load_monitor_b32:
case Intrinsic::amdgcn_flat_load_monitor_b64:
- case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
@@ -2254,6 +2257,14 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
return TargetLowering::isTypeDesirableForOp(Op, VT);
}
+MachinePointerInfo
+SITargetLowering::getKernargSegmentPtrInfo(MachineFunction &MF) const {
+ // This isn't really a constant pool but close enough.
+ MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool());
+ PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS;
+ return PtrInfo;
+}
+
SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
const SDLoc &SL,
SDValue Chain,
@@ -2330,7 +2341,9 @@ SDValue SITargetLowering::lowerKernargMemParameter(
SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
uint64_t Offset, Align Alignment, bool Signed,
const ISD::InputArg *Arg) const {
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+
+ MachinePointerInfo PtrInfo =
+ getKernargSegmentPtrInfo(DAG.getMachineFunction());
// Try to avoid using an extload by loading earlier than the argument address,
// and extracting the relevant bits. The load should hopefully be merged with
@@ -2345,7 +2358,8 @@ SDValue SITargetLowering::lowerKernargMemParameter(
// TODO: If we passed in the base kernel offset we could have a better
// alignment than 4, but we don't really need it.
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
- SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
+ SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
+ PtrInfo.getWithOffset(AlignDownOffset), Align(4),
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
@@ -2360,9 +2374,9 @@ SDValue SITargetLowering::lowerKernargMemParameter(
}
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
- SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
+ SDValue Load = DAG.getLoad(
+ MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
+ MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
@@ -3562,11 +3576,17 @@ SDValue SITargetLowering::LowerFormalArguments(
if (IsEntryFunc)
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
- // DAG.getPass() returns nullptr when using new pass manager.
- // TODO: Use DAG.getMFAM() to access analysis result.
if (DAG.getPass()) {
- auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
- ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
+ auto &ArgUsageInfo =
+ DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfoWrapperLegacy>();
+ ArgUsageInfo.getArgUsageInfo().setFuncArgInfo(Fn, Info->getArgInfo());
+ } else if (auto *MFAM = DAG.getMFAM()) {
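+ // With the new pass manager, reach the module-level argument usage
+ // analysis through the module analysis manager proxy; it may not be
+ // cached yet.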
+ Module &M = *MF.getFunction().getParent();
+ auto *ArgUsageInfo =
+ MFAM->getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF)
+ .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
+ if (ArgUsageInfo)
+ ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo());
}
unsigned StackArgSize = CCInfo.getStackSize();
@@ -3781,12 +3801,19 @@ void SITargetLowering::passSpecialInputs(
const AMDGPUFunctionArgInfo *CalleeArgInfo =
&AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
- // DAG.getPass() returns nullptr when using new pass manager.
- // TODO: Use DAG.getMFAM() to access analysis result.
if (DAG.getPass()) {
auto &ArgUsageInfo =
- DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
- CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
+ DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfoWrapperLegacy>();
+ CalleeArgInfo =
+ &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc);
+ } else if (auto *MFAM = DAG.getMFAM()) {
+ Module &M = *DAG.getMachineFunction().getFunction().getParent();
+ auto *ArgUsageInfo =
+ MFAM->getResult<ModuleAnalysisManagerMachineFunctionProxy>(
+ DAG.getMachineFunction())
+ .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
+ if (ArgUsageInfo)
+ CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc);
}
}
@@ -4052,7 +4079,7 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
if (!CI->isTailCall())
return false;
- const Function *ParentFn = CI->getParent()->getParent();
+ const Function *ParentFn = CI->getFunction();
if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
return false;
return true;
@@ -5469,6 +5496,10 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
return std::numeric_limits<uint32_t>::min();
case AMDGPU::S_MAX_I32:
return std::numeric_limits<int32_t>::min();
+ case AMDGPU::V_ADD_F32_e64: // -0.0
+ return 0x80000000;
+ case AMDGPU::V_SUB_F32_e64: // +0.0
+ return 0x0;
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32:
case AMDGPU::S_OR_B32:
@@ -5476,6 +5507,9 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
return std::numeric_limits<uint32_t>::min();
case AMDGPU::S_AND_B32:
return std::numeric_limits<uint32_t>::max();
+ case AMDGPU::V_MIN_F32_e64:
+ case AMDGPU::V_MAX_F32_e64:
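+ // min/max with a quiet NaN yields the other operand, so a quiet NaN acts
+ // as the identity for the reduction.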
+ return 0x7fc00000; // qNaN
default:
llvm_unreachable(
"Unexpected opcode in getIdentityValueFor32BitWaveReduction");
@@ -5510,7 +5544,14 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {
Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
- Opc == AMDGPU::S_XOR_B32;
+ Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
+ Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
+ Opc == AMDGPU::V_SUB_F32_e64;
+}
+
+static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
+ return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
+ Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
}
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
@@ -5531,8 +5572,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
switch (Opc) {
case AMDGPU::S_MIN_U32:
case AMDGPU::S_MIN_I32:
+ case AMDGPU::V_MIN_F32_e64:
case AMDGPU::S_MAX_U32:
case AMDGPU::S_MAX_I32:
+ case AMDGPU::V_MAX_F32_e64:
case AMDGPU::S_AND_B32:
case AMDGPU::S_OR_B32: {
// Idempotent operations.
@@ -5555,8 +5598,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
case AMDGPU::S_XOR_B64:
case AMDGPU::S_ADD_I32:
case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::V_ADD_F32_e64:
case AMDGPU::S_SUB_I32:
- case AMDGPU::S_SUB_U64_PSEUDO: {
+ case AMDGPU::S_SUB_U64_PSEUDO:
+ case AMDGPU::V_SUB_F32_e64: {
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
@@ -5711,6 +5756,30 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addImm(AMDGPU::sub1);
break;
}
+ case AMDGPU::V_ADD_F32_e64:
+ case AMDGPU::V_SUB_F32_e64: {
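+ // The input is uniform across the wave, so the reduction is the source
+ // value times the number of active lanes; the FSUB form negates the
+ // source via the src0 modifier.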
+ Register ActiveLanesVreg =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ // Get number of active lanes as a float val.
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
+ ActiveLanesVreg)
+ .addReg(NewAccumulator->getOperand(0).getReg())
+ .addImm(0) // clamp
+ .addImm(0); // output-modifier
+
+ // Take negation of input for SUB reduction
+ unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
+ .addImm(srcMod) // src0 modifier
+ .addReg(SrcReg)
+ .addImm(0) // src1 modifier
+ .addReg(ActiveLanesVreg)
+ .addImm(0) // clamp
+ .addImm(0); // output-mod
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .addReg(DstVreg);
+ }
}
RetBB = &BB;
}
@@ -5728,6 +5797,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
MachineBasicBlock::iterator I = BB.end();
Register SrcReg = MI.getOperand(1).getReg();
bool is32BitOpc = is32bitWaveReduceOperation(Opc);
+ bool isFPOp = isFloatingPointWaveReduceOperation(Opc);
// Create Control flow for loop
// Split MI's Machine Basic block into For loop
@@ -5787,9 +5857,29 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
LaneValueReg)
.addReg(SrcReg)
.addReg(FF1Reg);
- NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
- .addReg(Accumulator->getOperand(0).getReg())
- .addReg(LaneValueReg);
+ if (isFPOp) {
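+ // The FP opcodes are VALU: compute into a VGPR, then read the result
+ // back with V_READFIRSTLANE_B32 so it can serve as the scalar
+ // accumulator for the next iteration.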
+ Register LaneValVreg =
+ MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
+ Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
+ // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+ LaneValVreg)
+ .addReg(LaneValueReg);
+ BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
+ .addImm(0) // src0 modifier
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addImm(0) // src1 modifier
+ .addReg(LaneValVreg)
+ .addImm(0) // clamp
+ .addImm(0); // omod
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .addReg(DstVreg);
+ } else {
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addReg(LaneValueReg);
+ }
} else {
Register LaneValueLoReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -5921,6 +6011,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
+ case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
@@ -5929,14 +6021,20 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
+ case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
+ case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
+ case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
@@ -6347,8 +6445,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case AMDGPU::DS_GWS_INIT:
case AMDGPU::DS_GWS_SEMA_BR:
case AMDGPU::DS_GWS_BARRIER:
- TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
- [[fallthrough]];
case AMDGPU::DS_GWS_SEMA_V:
case AMDGPU::DS_GWS_SEMA_P:
case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
@@ -7035,9 +7131,15 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
SDLoc SL(N);
if (Src.getOpcode() == ISD::SETCC) {
+ SDValue Op0 = Src.getOperand(0);
+ SDValue Op1 = Src.getOperand(1);
+ // Need to expand bfloat to float for comparison (setcc).
+ if (Op0.getValueType() == MVT::bf16) {
+ Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
+ Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
+ }
// (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
- return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
- Src.getOperand(1), Src.getOperand(2));
+ return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
}
if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
// (ballot 0) -> 0
@@ -8057,10 +8159,11 @@ SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
MachineFunction &MF = DAG.getMachineFunction();
uint64_t Offset = getImplicitParameterOffset(MF, Param);
SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
- return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
+ MachinePointerInfo PtrInfo =
+ getKernargSegmentPtrInfo(DAG.getMachineFunction());
+ return DAG.getLoad(
+ VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
+ MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
}
SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
@@ -8322,6 +8425,9 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
Op.getValueType() == MVT::i64) {
const SIMachineFunctionInfo *Info =
DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
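+    // When the recorded high bits are zero, the cast reduces to a plain
+    // zero-extension of the 32-bit address.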
+ if (Info->get32BitAddressHighBits() == 0)
+ return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);
+
SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
@@ -9731,7 +9837,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
}
case Intrinsic::amdgcn_kernarg_segment_ptr: {
- if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
+ if (!AMDGPU::isKernel(MF.getFunction())) {
// This only makes sense to call in a kernel, so just lower to null.
return DAG.getConstant(0, DL, VT);
}
@@ -10477,9 +10583,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
- case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
- return lowerRawBufferAtomicIntrin(Op, DAG,
- AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
case Intrinsic::amdgcn_struct_buffer_atomic_swap:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
return lowerStructBufferAtomicIntrin(Op, DAG,
@@ -10521,10 +10624,21 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_CSUB);
+ case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
+ return lowerRawBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
return lowerStructBufferAtomicIntrin(Op, DAG,
AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
-
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
@@ -11892,7 +12006,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
(AS == AMDGPUAS::GLOBAL_ADDRESS &&
Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
- isMemOpHasNoClobberedMemOperand(Load))) {
+ (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
Alignment >= Align(4) && NumElements < 32) {
if (MemVT.isPow2VectorType() ||
@@ -13930,6 +14044,12 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
assert(OtherOp.getValueSizeInBits() == 32);
}
+ // Check that we haven't just recreated the same FSHR node.
+ if (N->getOpcode() == ISD::FSHR &&
+ (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
+ (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
+ return SDValue();
+
if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
assert(Op.getValueType().isByteSized() &&
@@ -17361,12 +17481,14 @@ void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
// Abandon attempt if the dst size isn't large enough
// - this is in fact an error but this is picked up elsewhere and
// reported correctly.
- uint32_t DstSize =
- TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
+ const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
+
+ uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
if (DstSize < InitIdx)
return;
} else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
- InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
+ const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
+ InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
} else {
return;
}
@@ -17414,7 +17536,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
- MachineFunction *MF = MI.getParent()->getParent();
+ MachineFunction *MF = MI.getMF();
MachineRegisterInfo &MRI = MF->getRegInfo();
if (TII->isVOP3(MI.getOpcode())) {
@@ -17550,6 +17672,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
break;
case 'v':
switch (BitWidth) {
+ case 1:
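+      // No vector register class can hold a 1-bit value; return an empty
+      // result for this constraint.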
+ return std::pair(0U, nullptr);
case 16:
RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
: &AMDGPU::VGPR_32_Lo256RegClass;
@@ -17567,6 +17691,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
if (!Subtarget->hasMAIInsts())
break;
switch (BitWidth) {
+ case 1:
+ return std::pair(0U, nullptr);
case 16:
RC = &AMDGPU::AGPR_32RegClass;
break;
@@ -18252,7 +18378,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
case AMDGPUISD::BUFFER_ATOMIC_INC:
case AMDGPUISD::BUFFER_ATOMIC_DEC:
case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
- case AMDGPUISD::BUFFER_ATOMIC_CSUB:
case AMDGPUISD::BUFFER_ATOMIC_FADD:
case AMDGPUISD::BUFFER_ATOMIC_FMIN:
case AMDGPUISD::BUFFER_ATOMIC_FMAX:
@@ -18487,7 +18612,19 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
case AtomicRMWInst::UIncWrap:
- case AtomicRMWInst::UDecWrap: {
+ case AtomicRMWInst::UDecWrap:
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat: {
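+    // usub_cond/usub_sat are only handled natively for 32-bit integers on
+    // subtargets with the corresponding instructions; otherwise expand via
+    // cmpxchg.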
+ if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
+ return AtomicExpansionKind::CmpXChg;
+ if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
+ return AtomicExpansionKind::CmpXChg;
+ if (Op == AtomicRMWInst::USubCond || Op == AtomicRMWInst::USubSat) {
+ auto *IT = dyn_cast<IntegerType>(RMW->getType());
+ if (!IT || IT->getBitWidth() != 32)
+ return AtomicExpansionKind::CmpXChg;
+ }
+
if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
if (Subtarget->hasEmulatedSystemScopeAtomics())
@@ -18752,8 +18889,11 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
: &AMDGPU::SReg_32RegClass;
if (!TRI->isSGPRClass(RC) && !isDivergent)
return TRI->getEquivalentSGPRClass(RC);
- if (TRI->isSGPRClass(RC) && isDivergent)
+ if (TRI->isSGPRClass(RC) && isDivergent) {
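+    // Subtargets with gfx90a instructions can use the combined AGPR/VGPR (AV)
+    // class, leaving the allocator free to pick either register file.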
+ if (Subtarget->hasGFX90AInsts())
+ return TRI->getEquivalentAVClass(RC);
return TRI->getEquivalentVGPRClass(RC);
+ }
return RC;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 74e58f4..fb16294 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -45,6 +45,8 @@ public:
LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
unsigned &NumIntermediates, MVT &RegisterVT) const override;
+ MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const;
+
private:
SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
SDValue Chain, uint64_t Offset) const;
@@ -332,7 +334,7 @@ public:
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override;
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override;
- bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
+ bool getTgtMemIntrinsic(IntrinsicInfo &, const CallBase &,
MachineFunction &MF,
unsigned IntrinsicID) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6dcbced..146f360 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -345,9 +345,7 @@ public:
class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
public:
- WaitcntGeneratorPreGFX12() = default;
- WaitcntGeneratorPreGFX12(const MachineFunction &MF)
- : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
+ using WaitcntGenerator::WaitcntGenerator;
bool
applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -381,10 +379,7 @@ public:
class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
public:
- WaitcntGeneratorGFX12Plus() = default;
- WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
- InstCounterType MaxCounter)
- : WaitcntGenerator(MF, MaxCounter) {}
+ using WaitcntGenerator::WaitcntGenerator;
bool
applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -531,6 +526,7 @@ public:
// instruction.
WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
switch (Inst.getOpcode()) {
+ // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
case AMDGPU::GLOBAL_INV:
return VMEM_READ_ACCESS; // tracked using loadcnt
case AMDGPU::GLOBAL_WB:
@@ -551,9 +547,7 @@ public:
return VMEM_ACCESS;
if (Inst.mayStore() &&
(!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
- // FLAT and SCRATCH instructions may access scratch. Other VMEM
- // instructions do not.
- if (TII->mayAccessScratchThroughFlat(Inst))
+ if (TII->mayAccessScratch(Inst))
return SCRATCH_WRITE_ACCESS;
return VMEM_WRITE_ACCESS;
}
@@ -633,8 +627,11 @@ public:
const MachineOperand &Op) const;
bool counterOutOfOrder(InstCounterType T) const;
- void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
+ void simplifyWaitcnt(AMDGPU::Waitcnt &Wait);
void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+ bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
+ bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
+ void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait);
void determineWait(InstCounterType T, RegInterval Interval,
AMDGPU::Waitcnt &Wait) const;
@@ -646,7 +643,6 @@ public:
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
- void applyXcnt(const AMDGPU::Waitcnt &Wait);
void updateByEvent(WaitEventType E, MachineInstr &MI);
unsigned hasPendingEvent() const { return PendingEvents; }
@@ -921,6 +917,7 @@ bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
+ assert(T < Context->MaxCounter);
unsigned UB = getScoreUB(T);
unsigned CurrScore = UB + 1;
@@ -1085,13 +1082,17 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
}
}
}
- if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1)
+ if (Slot)
break;
+      // The slot may not be valid because it can be >= NUM_LDS_VGPRS, which
+      // means the scoreboard cannot track it. We still want to preserve the
+      // MI in order to check alias information, though.
LDSDMAStores.push_back(&Inst);
Slot = LDSDMAStores.size();
break;
}
- setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore);
+ if (Slot < NUM_LDS_VGPRS)
+ setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore);
if (Slot)
setRegScore(FIRST_LDS_VGPR, T, CurrScore);
}
@@ -1113,33 +1114,33 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
switch (T) {
case LOAD_CNT:
OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
- << SR << "): ";
+ << SR << "):";
break;
case DS_CNT:
OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
- << SR << "): ";
+ << SR << "):";
break;
case EXP_CNT:
- OS << " EXP_CNT(" << SR << "): ";
+ OS << " EXP_CNT(" << SR << "):";
break;
case STORE_CNT:
OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
- << SR << "): ";
+ << SR << "):";
break;
case SAMPLE_CNT:
- OS << " SAMPLE_CNT(" << SR << "): ";
+ OS << " SAMPLE_CNT(" << SR << "):";
break;
case BVH_CNT:
- OS << " BVH_CNT(" << SR << "): ";
+ OS << " BVH_CNT(" << SR << "):";
break;
case KM_CNT:
- OS << " KM_CNT(" << SR << "): ";
+ OS << " KM_CNT(" << SR << "):";
break;
case X_CNT:
- OS << " X_CNT(" << SR << "): ";
+ OS << " X_CNT(" << SR << "):";
break;
default:
- OS << " UNKNOWN(" << SR << "): ";
+ OS << " UNKNOWN(" << SR << "):";
break;
}
@@ -1153,9 +1154,9 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
continue;
unsigned RelScore = RegScore - LB - 1;
if (J < FIRST_LDS_VGPR) {
- OS << RelScore << ":v" << J << " ";
+ OS << ' ' << RelScore << ":v" << J;
} else {
- OS << RelScore << ":ds ";
+ OS << ' ' << RelScore << ":ds";
}
}
// Also need to print sgpr scores for lgkm_cnt or xcnt.
@@ -1165,11 +1166,11 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
if (RegScore <= LB)
continue;
unsigned RelScore = RegScore - LB - 1;
- OS << RelScore << ":s" << J << " ";
+ OS << ' ' << RelScore << ":s" << J;
}
}
if (T == KM_CNT && SCCScore > 0)
- OS << SCCScore << ":scc ";
+ OS << ' ' << SCCScore << ":scc";
}
OS << '\n';
}
@@ -1192,7 +1193,7 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
/// Simplify the waitcnt, in the sense of removing redundant counts, and return
/// whether a waitcnt instruction is needed at all.
-void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
+void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {
simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
simplifyWaitcnt(DS_CNT, Wait.DsCnt);
@@ -1200,7 +1201,7 @@ void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
simplifyWaitcnt(KM_CNT, Wait.KmCnt);
- simplifyWaitcnt(X_CNT, Wait.XCnt);
+ simplifyXcnt(Wait, Wait);
}
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -1270,7 +1271,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
applyWaitcnt(BVH_CNT, Wait.BvhCnt);
applyWaitcnt(KM_CNT, Wait.KmCnt);
- applyXcnt(Wait);
+ applyWaitcnt(X_CNT, Wait.XCnt);
}
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -1287,21 +1288,42 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
}
}
-void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
// Wait on XCNT is redundant if we are already waiting for a load to complete.
// SMEM can return out of order, so only omit XCNT wait if we are waiting till
// zero.
- if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
- return applyWaitcnt(X_CNT, 0);
+ return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
+}
+bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
  // If we have a pending store, we cannot optimize XCnt because we do not
  // wait for stores. VMEM loads return in order, so if we only have loads,
  // XCnt is decremented to the same number as LOADCnt.
- if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
- !hasPendingEvent(STORE_CNT))
- return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+ return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
+ !hasPendingEvent(STORE_CNT);
+}
- applyWaitcnt(X_CNT, Wait.XCnt);
+void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait,
+ AMDGPU::Waitcnt &UpdateWait) {
+  // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
+  // optimizations. On entry to a block with multiple predecessors, there may
+  // be pending SMEM and VMEM events active at the same time.
+  // In such cases, only clear one active event at a time.
+ // TODO: Revisit xcnt optimizations for gfx1250.
+ if (hasRedundantXCntWithKmCnt(CheckWait)) {
+ if (!hasMixedPendingEvents(X_CNT)) {
+ applyWaitcnt(X_CNT, 0);
+ } else {
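+      // Only the SMEM group is known to be complete; keep any pending VMEM
+      // group events.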
+ PendingEvents &= ~(1 << SMEM_GROUP);
+ }
+ } else if (canOptimizeXCntWithLoadCnt(CheckWait)) {
+ if (!hasMixedPendingEvents(X_CNT)) {
+ applyWaitcnt(X_CNT, std::min(CheckWait.XCnt, CheckWait.LoadCnt));
+ } else if (CheckWait.LoadCnt == 0) {
+ PendingEvents &= ~(1 << VMEM_GROUP);
+ }
+ }
+ simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
}
// Where there are multiple types of event in the bracket of a counter,
@@ -1518,7 +1540,7 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
Modified = true;
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
dbgs() << "New Instr: " << *SWaitInst << '\n');
}
@@ -1532,7 +1554,7 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
.addImm(Wait.StoreCnt);
Modified = true;
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
dbgs() << "New Instr: " << *SWaitInst << '\n');
}
@@ -1636,6 +1658,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
}
}
+  // Save the pre-combine waitcnt so the xcnt checks below can use it.
+ AMDGPU::Waitcnt PreCombine = Wait;
if (CombinedLoadDsCntInstr) {
// Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
// to be waited for. Otherwise, let the instruction be deleted so
@@ -1726,6 +1750,13 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
}
for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+ if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||
+ (CT == LOAD_CNT &&
+ ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) {
+      // Xcnt may need to be updated based on a pre-existing KM_CNT/LOAD_CNT
+      // wait, e.g. after taking the backedge of a block.
+ ScoreBrackets.simplifyXcnt(PreCombine, Wait);
+ }
if (!WaitInstrs[CT])
continue;
@@ -1790,7 +1821,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
if (SWaitInst) {
Modified = true;
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
dbgs() << "New Instr: " << *SWaitInst << '\n');
}
@@ -1810,7 +1841,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
Modified = true;
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
dbgs() << "New Instr: " << *SWaitInst << '\n');
}
@@ -1979,15 +2010,23 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (Ptr && Memop->getAAInfo()) {
const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
- if (MI.mayAlias(AA, *LDSDMAStores[I], true))
+ if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
+ if ((I + 1) >= NUM_LDS_VGPRS) {
+            // We didn't have enough slots to track this LDS DMA store, so it
+            // has been tracked using the common RegNo (FIRST_LDS_VGPR).
+ ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
+ break;
+ }
+
ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
+ }
}
} else {
ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
}
- if (Memop->isStore()) {
+
+ if (Memop->isStore())
ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
- }
}
// Loop over use and def operands.
@@ -2072,6 +2111,14 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// Verify that the wait is actually needed.
ScoreBrackets.simplifyWaitcnt(Wait);
+  // Since the translation of VMEM addresses occurs in order, we can apply the
+  // XCnt if the current instruction is of VMEM type and has a memory
+  // dependency with another VMEM instruction in flight.
+ if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
+ ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
+ Wait.XCnt = ~0u;
+ }
+
// When forcing emit, we need to skip terminators because that would break the
// terminators of the MBB if we emit a waitcnt between terminators.
if (ForceEmitZeroFlag && !MI.isTerminator())
@@ -2140,21 +2187,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
<< "Update Instr: " << *It);
}
- // XCnt may be already consumed by a load wait.
- if (Wait.XCnt != ~0u) {
- if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
- Wait.XCnt = ~0u;
-
- if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
- Wait.XCnt = ~0u;
-
- // Since the translation for VMEM addresses occur in-order, we can skip the
- // XCnt if the current instruction is of VMEM type and has a memory
- // dependency with another VMEM instruction in flight.
- if (isVmemAccess(*It))
- Wait.XCnt = ~0u;
- }
-
if (WCG->createNewWaitcnt(Block, It, Wait))
Modified = true;
@@ -2265,10 +2297,12 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
}
- // This is a flat memory operation that access both VMEM and LDS, so note it
- // - it will require that both the VM and LGKM be flushed to zero if it is
- // pending when a VM or LGKM dependency occurs.
- if (FlatASCount > 1)
+  // Async/LDSDMA operations have FLAT encoding but do not actually use flat
+  // pointers. They have two operands, one accessing global memory and one
+  // accessing LDS, which makes them look like flat accesses at this point.
+  // Filter them out, and for the rest, generate a dependency on flat pointers
+  // so that both the VM and LGKM counters are flushed.
+ if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1)
ScoreBrackets->setPendingFlat();
} else if (SIInstrInfo::isVMEM(Inst) &&
!llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
@@ -2720,7 +2754,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
WCG = &WCGGFX12Plus;
} else {
MaxCounter = NUM_NORMAL_INST_CNTS;
- WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
+ WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, MaxCounter);
WCG = &WCGPreGFX12;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d930a21..6d21109 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -63,7 +63,8 @@ static cl::opt<bool> Fix16BitCopies(
cl::ReallyHidden);
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
- : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+ : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
+ AMDGPU::ADJCALLSTACKDOWN),
RI(ST), ST(ST) {
SchedModel.init(&ST);
}
@@ -161,7 +162,7 @@ bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
if (!DstReg.isVirtual())
return true;
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
switch (Use.getOpcode()) {
case AMDGPU::S_AND_SAVEEXEC_B32:
@@ -1667,8 +1668,7 @@ unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
void SIInstrInfo::storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
- bool isKill, int FrameIndex, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI, Register VReg,
+ bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
MachineInstr::MIFlag Flags) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
@@ -1680,7 +1680,7 @@ void SIInstrInfo::storeRegToStackSlot(
MachineMemOperand *MMO = MF->getMachineMemOperand(
PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
FrameInfo.getObjectAlign(FrameIndex));
- unsigned SpillSize = TRI->getSpillSize(*RC);
+ unsigned SpillSize = RI.getSpillSize(*RC);
MachineRegisterInfo &MRI = MF->getRegInfo();
if (RI.isSGPRClass(RC)) {
@@ -1862,14 +1862,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI,
Register VReg,
MachineInstr::MIFlag Flags) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const DebugLoc &DL = MBB.findDebugLoc(MI);
- unsigned SpillSize = TRI->getSpillSize(*RC);
+ unsigned SpillSize = RI.getSpillSize(*RC);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
@@ -1964,6 +1963,10 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
MF->push_back(TrapBB);
MBB.addSuccessor(TrapBB);
+ } else {
+ // Since we're adding HaltLoopBB and modifying the CFG, we must return a
+ // different block to signal the change.
+ ContBB = HaltLoopBB;
}
// Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
@@ -2518,8 +2521,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, Register DestReg,
- unsigned SubIdx, const MachineInstr &Orig,
- const TargetRegisterInfo &RI) const {
+ unsigned SubIdx,
+ const MachineInstr &Orig) const {
// Try shrinking the instruction to remat only the part needed for current
// context.
@@ -2569,7 +2572,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
const MCInstrDesc &TID = get(NewOpcode);
const TargetRegisterClass *NewRC =
- RI.getAllocatableClass(getRegClass(TID, 0, &RI));
+ RI.getAllocatableClass(getRegClass(TID, 0));
MRI.setRegClass(DestReg, NewRC);
UseMO->setReg(DestReg);
@@ -2599,7 +2602,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
break;
}
- TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
+ TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig);
}
std::pair<MachineInstr*, MachineInstr*>
@@ -2935,7 +2938,7 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
if (FlushSGPRWrites)
BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0, ST));
};
// We need to compute the offset relative to the instruction immediately after
@@ -3461,6 +3464,21 @@ void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
}
}
+void SIInstrInfo::mutateAndCleanupImplicit(MachineInstr &MI,
+ const MCInstrDesc &NewDesc) const {
+ MI.setDesc(NewDesc);
+
+  // Remove any leftover implicit operands from mutating the instruction. For
+  // example, if we replace an s_and_b32 with a copy, we no longer need the
+  // implicit scc def.
+ const MCInstrDesc &Desc = MI.getDesc();
+ unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
+ Desc.implicit_defs().size();
+
+ for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
+ MI.removeOperand(I);
+}
+
std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
unsigned SubRegIndex) {
switch (SubRegIndex) {
@@ -3612,7 +3630,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
const MCInstrDesc &MovDesc = get(MovOp);
- const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI);
+ const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
if (Is16Bit) {
// We just need to find a correctly sized register class, so the
// subregister index compatibility doesn't matter since we're statically
@@ -3917,6 +3935,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
if (isLDSDMA(MIa) || isLDSDMA(MIb))
return false;
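+  // Bundles are not decomposed operand-by-operand here; conservatively assume
+  // they may alias.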
+ if (MIa.isBundle() || MIb.isBundle())
+ return false;
+
// TODO: Should we check the address space from the MachineMemOperand? That
// would allow us to distinguish objects we know don't alias based on the
// underlying address space, even if it was lowered to a different one,
@@ -3982,7 +4003,7 @@ static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
MachineInstr **DefMI = nullptr) {
if (!MO->isReg())
return false;
- const MachineFunction *MF = MO->getParent()->getParent()->getParent();
+ const MachineFunction *MF = MO->getParent()->getMF();
const MachineRegisterInfo &MRI = MF->getRegInfo();
return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
}
@@ -4044,10 +4065,29 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LiveVariables *LV,
LiveIntervals *LIS) const {
MachineBasicBlock &MBB = *MI.getParent();
+ MachineInstr *CandidateMI = &MI;
+
+ if (MI.isBundle()) {
+ // This is a temporary placeholder for bundle handling that enables us to
+ // exercise the relevant code paths in the two-address instruction pass.
+ if (MI.getBundleSize() != 1)
+ return nullptr;
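+    // The single bundled instruction immediately follows the BUNDLE header.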
+ CandidateMI = MI.getNextNode();
+ }
+
ThreeAddressUpdates U;
- MachineInstr *NewMI = convertToThreeAddressImpl(MI, U);
+ MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
+ if (!NewMI)
+ return nullptr;
+
+ if (MI.isBundle()) {
+ CandidateMI->eraseFromBundle();
- if (NewMI) {
+ for (MachineOperand &MO : MI.all_defs()) {
+ if (MO.isTied())
+ MI.untieRegOperand(MO.getOperandNo());
+ }
+ } else {
updateLiveVariables(LV, MI, *NewMI);
if (LIS) {
LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
@@ -4088,7 +4128,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LV->getVarInfo(DefReg).AliveBlocks.clear();
}
- if (LIS) {
+ if (MI.isBundle()) {
+ VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
+ if (!VRI.Reads && !VRI.Writes) {
+ for (MachineOperand &MO : MI.all_uses()) {
+ if (MO.isReg() && MO.getReg() == DefReg) {
+ assert(MO.getSubReg() == 0 &&
+ "tied sub-registers in bundles currently not supported");
+ MI.removeOperand(MO.getOperandNo());
+ break;
+ }
+ }
+
+ if (LIS)
+ LIS->shrinkToUses(&LIS->getInterval(DefReg));
+ }
+ } else if (LIS) {
LiveInterval &DefLI = LIS->getInterval(DefReg);
// We cannot delete the original instruction here, so hack out the use
@@ -4103,11 +4158,26 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
}
}
+ if (MI.isBundle()) {
+ VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
+ if (!VRI.Reads && !VRI.Writes) {
+ for (MachineOperand &MIOp : MI.uses()) {
+ if (MIOp.isReg() && MIOp.getReg() == DefReg) {
+ MIOp.setIsUndef(true);
+ MIOp.setReg(DummyReg);
+ }
+ }
+ }
+
+ MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
+ false, /*isUndef=*/true));
+ }
+
LIS->shrinkToUses(&DefLI);
}
}
- return NewMI;
+ return MI.isBundle() ? &MI : NewMI;
}
MachineInstr *
@@ -4121,7 +4191,7 @@ SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
if (NewMFMAOpc != -1) {
MachineInstrBuilder MIB =
BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+ for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
MIB.add(MI.getOperand(I));
return MIB;
}
@@ -4130,7 +4200,7 @@ SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
.setMIFlags(MI.getFlags());
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+ for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
MIB->addOperand(MI.getOperand(I));
return MIB;
}
@@ -4329,8 +4399,9 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}
-bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
- if (!isFLAT(MI) || isFLATGlobal(MI))
+bool SIInstrInfo::mayAccessScratch(const MachineInstr &MI) const {
+  // Instructions that access scratch use FLAT or BUF encodings.
+ if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
return false;
// If scratch is not initialized, we can never access it.
@@ -4948,7 +5019,7 @@ bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const {
uint16_t Opcode = MI.getOpcode();
- const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineFunction *MF = MI.getMF();
const MachineRegisterInfo &MRI = MF->getRegInfo();
// FIXME: At this point the COPY verify is done only for non-ssa forms.
@@ -5452,9 +5523,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
Desc.getNumOperands() + Desc.implicit_uses().size();
const unsigned NumImplicitOps = IsDst ? 2 : 1;
- // Allow additional implicit operands. This allows a fixup done by the post
- // RA scheduler where the main implicit operand is killed and implicit-defs
- // are added for sub-registers that remain live after this instruction.
+  // Require additional implicit operands. This accommodates a fixup done by
+  // the post-RA scheduler where the main implicit operand is killed and
+  // implicit-defs are added for sub-registers that remain live after this
+  // instruction.
if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
ErrInfo = "missing implicit register operands";
return false;
@@ -5736,6 +5808,17 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
+ MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
+ const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
+ if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
+ &AMDGPU::SReg_64RegClass) ||
+ Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
+ ErrInfo = "Instruction cannot read flat_scratch_base_hi";
+ return false;
+ }
+ }
+
return true;
}
@@ -5754,7 +5837,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
case AMDGPU::S_MOV_B32: {
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
return MI.getOperand(1).isReg() ||
RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
@@ -6021,19 +6104,6 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
}
-// FIXME: This should not be an overridable function. All subtarget dependent
-// operand modifications should go through isLookupRegClassByHwMode in the
-// generic handling.
-const TargetRegisterClass *
-SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum,
- const TargetRegisterInfo *TRI) const {
- if (OpNum >= TID.getNumOperands())
- return nullptr;
- const MCOperandInfo &OpInfo = TID.operands()[OpNum];
- int16_t RegClass = getOpRegClassID(OpInfo);
- return RI.getRegClass(RegClass);
-}
-
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
unsigned OpNo) const {
const MCInstrDesc &Desc = get(MI.getOpcode());
@@ -6042,14 +6112,14 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
Register Reg = MI.getOperand(OpNo).getReg();
if (Reg.isVirtual()) {
- const MachineRegisterInfo &MRI =
- MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
return MRI.getRegClass(Reg);
}
return RI.getPhysRegBaseClass(Reg);
}
- return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo]));
+ int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
+ return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
}
void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
@@ -6133,7 +6203,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
if (MO.getSubReg()) {
- const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+ const MachineFunction *MF = MO.getParent()->getMF();
const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
if (!SuperRC)
return false;
@@ -6145,7 +6215,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand &MO) const {
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
unsigned Opc = MI.getOpcode();
@@ -6153,7 +6223,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
// information.
if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
- constexpr const AMDGPU::OpName OpNames[] = {
+ constexpr AMDGPU::OpName OpNames[] = {
AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
for (auto [I, OpName] : enumerate(OpNames)) {
@@ -6198,6 +6268,18 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
(int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
RI.isSGPRReg(MRI, MO.getReg()))
return false;
+
+ if (ST.hasFlatScratchHiInB64InstHazard() &&
+ MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
+ if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
+ if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
+ 64)
+ return false;
+ }
+ if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
+ return false;
+ }
+
return true;
}
@@ -6215,8 +6297,8 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
const MachineOperand *MO) const {
- constexpr const unsigned NumOps = 3;
- constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
+ constexpr unsigned NumOps = 3;
+ constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
AMDGPU::OpName::src0, AMDGPU::OpName::src1,
AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
@@ -6247,7 +6329,7 @@ bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
@@ -6801,7 +6883,7 @@ void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
return;
const TargetRegisterClass *DeclaredRC =
- getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI);
+ getRegClass(MI.getDesc(), SAddr->getOperandNo());
Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
SAddr->setReg(ToSGPR);
@@ -7143,7 +7225,7 @@ extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
MachineBasicBlock *
SIInstrInfo::legalizeOperands(MachineInstr &MI,
MachineDominatorTree *MDT) const {
- MachineFunction &MF = *MI.getParent()->getParent();
+ MachineFunction &MF = *MI.getMF();
MachineRegisterInfo &MRI = MF.getRegInfo();
MachineBasicBlock *CreatedBB = nullptr;
@@ -7632,6 +7714,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
unsigned Opcode = Inst.getOpcode();
unsigned NewOpcode = getVALUOp(Inst);
+ const DebugLoc &DL = Inst.getDebugLoc();
+
// Handle some special cases
switch (Opcode) {
default:
@@ -7783,6 +7867,11 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.eraseFromParent();
return;
+ case AMDGPU::S_ABSDIFF_I32:
+ lowerScalarAbsDiff(Worklist, Inst);
+ Inst.eraseFromParent();
+ return;
+
case AMDGPU::S_CBRANCH_SCC0:
case AMDGPU::S_CBRANCH_SCC1: {
// Clear unused bits of vcc
@@ -7869,7 +7958,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
- const DebugLoc &DL = Inst.getDebugLoc();
MachineOperand &Dest0 = Inst.getOperand(0);
MachineOperand &Dest1 = Inst.getOperand(1);
MachineOperand &Src0 = Inst.getOperand(2);
@@ -7889,12 +7977,37 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
legalizeOperands(*NewInstr, MDT);
MRI.replaceRegWith(Dest0.getReg(), DestReg);
- addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
- Worklist);
+ addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
Inst.eraseFromParent();
}
return;
+ case AMDGPU::S_LSHL1_ADD_U32:
+ case AMDGPU::S_LSHL2_ADD_U32:
+ case AMDGPU::S_LSHL3_ADD_U32:
+ case AMDGPU::S_LSHL4_ADD_U32: {
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
+ : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
+ : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
+ : 4);
+
+ const TargetRegisterClass *NewRC =
+ RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
+ Register DestReg = MRI.createVirtualRegister(NewRC);
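+    // V_LSHL_ADD_U32 computes (first source << shift) + second source, which
+    // matches the scalar S_LSHLn_ADD_U32 semantics.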
+ MachineInstr *NewInstr =
+ BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
+ .add(Src0)
+ .addImm(ShiftAmt)
+ .add(Src1);
+ legalizeOperands(*NewInstr, MDT);
+ MRI.replaceRegWith(Dest.getReg(), DestReg);
+ addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
+ Inst.eraseFromParent();
+ }
+ return;
case AMDGPU::S_CSELECT_B32:
case AMDGPU::S_CSELECT_B64:
lowerSelect(Worklist, Inst, MDT);
@@ -7945,7 +8058,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
legalizeOperands(*NewInstr, MDT);
int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
- MachineOperand SCCOp = Inst.getOperand(SCCIdx);
+ const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
Inst.eraseFromParent();
return;
@@ -7985,13 +8098,12 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
legalizeOperandsVALUt16(*NewInstr, MRI);
legalizeOperands(*NewInstr, MDT);
int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
- MachineOperand SCCOp = Inst.getOperand(SCCIdx);
+ const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
Inst.eraseFromParent();
return;
}
case AMDGPU::S_CVT_HI_F32_F16: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
if (ST.useRealTrue16Insts()) {
@@ -8021,7 +8133,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MINIMUM_F32:
case AMDGPU::S_MAXIMUM_F32: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
.addImm(0) // src0_modifiers
@@ -8039,7 +8150,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MINIMUM_F16:
case AMDGPU::S_MAXIMUM_F16: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
? &AMDGPU::VGPR_16RegClass
: &AMDGPU::VGPR_32RegClass);
@@ -8063,7 +8173,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
case AMDGPU::V_S_RCP_F16_e64:
case AMDGPU::V_S_RSQ_F16_e64:
case AMDGPU::V_S_SQRT_F16_e64: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
? &AMDGPU::VGPR_16RegClass
: &AMDGPU::VGPR_32RegClass);
@@ -8112,26 +8221,34 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
- if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
- NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
- // Instead of creating a copy where src and dst are the same register
- // class, we just replace all uses of dst with src. These kinds of
- // copies interfere with the heuristics MachineSink uses to decide
- // whether or not to split a critical edge. Since the pass assumes
- // that copies will end up as machine instructions and not be
- // eliminated.
- addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+ if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
Register NewDstReg = Inst.getOperand(1).getReg();
- MRI.replaceRegWith(DstReg, NewDstReg);
- MRI.clearKillFlags(NewDstReg);
- Inst.getOperand(0).setReg(DstReg);
- Inst.eraseFromParent();
- // Legalize t16 operand since replaceReg is called after addUsersToVALU
- for (MachineOperand &MO :
- make_early_inc_range(MRI.use_operands(NewDstReg))) {
- legalizeOperandsVALUt16(*MO.getParent(), MRI);
+ const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
+ if (const TargetRegisterClass *CommonRC =
+ RI.getCommonSubClass(NewDstRC, SrcRC)) {
+ // Instead of creating a copy where src and dst are the same register
+ // class, we just replace all uses of dst with src. These kinds of
+      // copies interfere with the heuristics MachineSink uses to decide
+      // whether or not to split a critical edge, since the pass assumes that
+      // copies will end up as machine instructions and not be eliminated.
+ addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ MRI.clearKillFlags(NewDstReg);
+ Inst.getOperand(0).setReg(DstReg);
+
+ if (!MRI.constrainRegClass(NewDstReg, CommonRC))
+ llvm_unreachable("failed to constrain register");
+
+ Inst.eraseFromParent();
+ // Legalize t16 operand since replaceReg is called after addUsersToVALU
+ for (MachineOperand &MO :
+ make_early_inc_range(MRI.use_operands(NewDstReg))) {
+ legalizeOperandsVALUt16(*MO.getParent(), MRI);
+ }
+
+ return;
}
- return;
}
// If this is a v2s copy between 16bit and 32bit reg,
@@ -8183,7 +8300,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
AMDGPU::OpName::src0_modifiers) >= 0)
NewInstr.addImm(0);
if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
- MachineOperand Src = Inst.getOperand(1);
+ const MachineOperand &Src = Inst.getOperand(1);
NewInstr->addOperand(Src);
}
@@ -8412,6 +8529,37 @@ void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
+void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
+ MachineInstr &Inst) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ const DebugLoc &DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src1 = Inst.getOperand(1);
+ MachineOperand &Src2 = Inst.getOperand(2);
+ Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ unsigned SubOp =
+ ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
+
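+  // absdiff(x, y) = max(x - y, -(x - y)): form the difference and its
+  // negation, then take the signed maximum.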
+ BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
+ .addReg(Src1.getReg())
+ .addReg(Src2.getReg());
+
+ BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
+
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
+ .addReg(SubResultReg)
+ .addReg(TmpReg);
+
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -9199,7 +9347,7 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
+void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
MachineInstr &SCCDefInst,
SIInstrWorklist &Worklist,
Register NewCond) const {
@@ -9217,7 +9365,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
if (SCCIdx != -1) {
if (MI.isCopy()) {
- MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
Register DestReg = MI.getOperand(0).getReg();
MRI.replaceRegWith(DestReg, NewCond);
@@ -9329,7 +9477,7 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
return SGPRReg;
Register UsedSGPRs[3] = {Register()};
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
for (unsigned i = 0; i < 3; ++i) {
int Idx = OpIndices[i];
@@ -9579,7 +9727,7 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return getInstBundleSize(MI);
case TargetOpcode::INLINEASM:
case TargetOpcode::INLINEASM_BR: {
- const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineFunction *MF = MI.getMF();
const char *AsmStr = MI.getOperand(0).getSymbolName();
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
}
@@ -9714,7 +9862,7 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
// needed by the prolog. However, the insertions for scalar registers can
// always be placed at the BB top as they are independent of the exec mask
// value.
- const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineFunction *MF = MI.getMF();
bool IsNullOrVectorRegister = true;
if (Reg) {
const MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -10160,7 +10308,7 @@ static bool followSubRegDef(MachineInstr &MI,
}
MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
- MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI) {
assert(MRI.isSSA());
if (!P.Reg.isVirtual())
return nullptr;
@@ -10501,7 +10649,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::Default;
}
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
// FIXME: It's conceptually broken to report this for an instruction, and not
@@ -10618,6 +10766,44 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
return false;
}
+// SCC is already valid after SCCValid.
+// SCCRedefine will redefine SCC to the same value that is already available
+// after SCCValid. If there are no intervening SCC conflicts, delete
+// SCCRedefine and update kill/dead flags as necessary.
+static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
+ const SIRegisterInfo &RI) {
+ MachineInstr *KillsSCC = nullptr;
+ if (SCCValid->getParent() != SCCRedefine->getParent())
+ return false;
+ for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
+ SCCRedefine->getIterator())) {
+ if (MI.modifiesRegister(AMDGPU::SCC, &RI))
+ return false;
+ if (MI.killsRegister(AMDGPU::SCC, &RI))
+ KillsSCC = &MI;
+ }
+ if (MachineOperand *SccDef =
+ SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
+ SccDef->setIsDead(false);
+ if (KillsSCC)
+ KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
+ SCCRedefine->eraseFromParent();
+ return true;
+}
+
+static bool foldableSelect(const MachineInstr &Def) {
+ if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
+ Def.getOpcode() != AMDGPU::S_CSELECT_B64)
+ return false;
+ bool Op1IsNonZeroImm =
+ Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
+ bool Op2IsZeroImm =
+ Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
+ if (!Op1IsNonZeroImm || !Op2IsZeroImm)
+ return false;
+ return true;
+}
+
bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
Register SrcReg2, int64_t CmpMask,
int64_t CmpValue,
@@ -10633,23 +10819,10 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (CmpValue != 0)
return false;
- MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
- if (!Def || Def->getParent() != CmpInstr.getParent())
+ MachineInstr *Def = MRI->getVRegDef(SrcReg);
+ if (!Def)
return false;
- const auto foldableSelect = [](MachineInstr *Def) -> bool {
- if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
- Def->getOpcode() == AMDGPU::S_CSELECT_B64) {
- bool Op1IsNonZeroImm =
- Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0;
- bool Op2IsZeroImm =
- Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0;
- if (Op1IsNonZeroImm && Op2IsZeroImm)
- return true;
- }
- return false;
- };
-
// For S_OP that set SCC = DST!=0, do the transformation
//
// s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
@@ -10660,24 +10833,38 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
//
// s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
// imm), 0)
- if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def))
+ if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def))
return false;
- MachineInstr *KillsSCC = nullptr;
- for (MachineInstr &MI :
- make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
- if (MI.modifiesRegister(AMDGPU::SCC, &RI))
- return false;
- if (MI.killsRegister(AMDGPU::SCC, &RI))
- KillsSCC = &MI;
- }
+ if (!optimizeSCC(Def, &CmpInstr, RI))
+ return false;
- if (MachineOperand *SccDef =
- Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
- SccDef->setIsDead(false);
- if (KillsSCC)
- KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
- CmpInstr.eraseFromParent();
+ // If the s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
+ // s_cmp_lg of a register pair) and its inputs are the lo and hi halves of a
+ // 64-bit foldableSelect, then delete the s_or_b32 in the sequence:
+ // sX = s_cselect_b64 (non-zero imm), 0
+ // sLo = copy sX.sub0
+ // sHi = copy sX.sub1
+ // sY = s_or_b32 sLo, sHi
+ if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
+ MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
+ const MachineOperand &OrOpnd1 = Def->getOperand(1);
+ const MachineOperand &OrOpnd2 = Def->getOperand(2);
+ if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
+ MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
+ MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
+ if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
+ Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
+ Def2->getOperand(1).isReg() &&
+ Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
+ Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
+ Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
+ MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
+ if (Select && foldableSelect(*Select))
+ optimizeSCC(Select, Def, RI);
+ }
+ }
+ }
return true;
};
@@ -10707,8 +10894,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
// s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
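// A quick soundness check of the bit-test rewrite (a sketch, assuming the
// usual SALU semantics):
//   (x & (1 << n)) != (1 << n)   <=>   bit n of x is 0
// which is exactly what s_bitcmp0_* writes to SCC, so the compare of the
// masked value can be rewritten as a single bit test on $src.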
- MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
- if (!Def || Def->getParent() != CmpInstr.getParent())
+ MachineInstr *Def = MRI->getVRegDef(SrcReg);
+ if (!Def)
return false;
if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
@@ -10755,21 +10942,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
return false;
- MachineInstr *KillsSCC = nullptr;
- for (MachineInstr &MI :
- make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
- if (MI.modifiesRegister(AMDGPU::SCC, &RI))
- return false;
- if (MI.killsRegister(AMDGPU::SCC, &RI))
- KillsSCC = &MI;
- }
-
- MachineOperand *SccDef =
- Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
- SccDef->setIsDead(false);
- if (KillsSCC)
- KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
- CmpInstr.eraseFromParent();
+ if (!optimizeSCC(Def, &CmpInstr, RI))
+ return false;
if (!MRI->use_nodbg_empty(DefReg)) {
assert(!IsReversedCC);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index dc23a21..b1d6563 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -136,6 +136,8 @@ private:
void lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
+ void lowerScalarAbsDiff(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
+
void lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
void splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst,
@@ -172,7 +174,7 @@ private:
void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI,
SIInstrWorklist &Worklist) const;
- void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
+ void addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
MachineInstr &SCCDefInst,
SIInstrWorklist &Worklist,
Register NewCond = Register()) const;
@@ -307,22 +309,19 @@ public:
void storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
- bool isKill, int FrameIndex, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI, Register VReg,
+ bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
void loadRegFromStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
- int FrameIndex, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI, Register VReg,
+ int FrameIndex, const TargetRegisterClass *RC, Register VReg,
MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
bool expandPostRAPseudo(MachineInstr &MI) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
Register DestReg, unsigned SubIdx,
- const MachineInstr &Orig,
- const TargetRegisterInfo &TRI) const override;
+ const MachineInstr &Orig) const override;
// Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp
// instructions. Returns a pair of generated instructions.
@@ -426,6 +425,9 @@ public:
void removeModOperands(MachineInstr &MI) const;
+ void mutateAndCleanupImplicit(MachineInstr &MI,
+ const MCInstrDesc &NewDesc) const;
+
/// Return the extracted immediate value in a subregister use from a constant
/// materialized in a super register.
///
@@ -583,6 +585,10 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
}
+ static bool isBUF(const MachineInstr &MI) {
+ return isMUBUF(MI) || isMTBUF(MI);
+ }
+
static bool isSMRD(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::SMRD;
}
@@ -688,11 +694,11 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
}
- /// \returns true for SCRATCH_ instructions, or FLAT_ instructions with
- /// SCRATCH_ memory operands.
+ /// \returns true for SCRATCH_ instructions, or FLAT/BUF instructions whose
+ /// MMOs do not rule out a scratch access.
/// Conservatively correct; will return true if \p MI cannot be proven
/// to not hit scratch.
- bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+ bool mayAccessScratch(const MachineInstr &MI) const;
/// \returns true for FLAT instructions that can access VMEM.
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
@@ -1174,13 +1180,13 @@ public:
bool isVGPRCopy(const MachineInstr &MI) const {
assert(isCopyInstr(MI));
Register Dest = MI.getOperand(0).getReg();
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
return !RI.isSGPRReg(MRI, Dest);
}
bool hasVGPRUses(const MachineInstr &MI) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
return llvm::any_of(MI.explicit_uses(),
[&MRI, this](const MachineOperand &MO) {
@@ -1622,10 +1628,6 @@ public:
/// Return true if this opcode should not be used by codegen.
bool isAsmOnlyOpcode(int MCOp) const;
- const TargetRegisterClass *
- getRegClass(const MCInstrDesc &TID, unsigned OpNum,
- const TargetRegisterInfo *TRI) const override;
-
void fixImplicitOperands(MachineInstr &MI) const;
MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
@@ -1655,6 +1657,7 @@ public:
const TargetSchedModel &getSchedModel() const { return SchedModel; }
+ // FIXME: This should be removed
// Enforce operand's \p OpName even alignment if required by target.
// This is used if an operand is a 32 bit register but needs to be aligned
// regardless.
@@ -1687,7 +1690,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
/// skipping copy like instructions and subreg-manipulation pseudos.
/// Following another subreg of a reg:subreg isn't supported.
MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
- MachineRegisterInfo &MRI);
+ const MachineRegisterInfo &MRI);
/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
/// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index b7f63ec..628b972 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -6,13 +6,6 @@
//
//===----------------------------------------------------------------------===//
-def isWave32 : Predicate<"Subtarget->isWave32()">,
- AssemblerPredicate <(any_of FeatureWavefrontSize32,
- FeatureAssemblerPermissiveWavesize)>;
-def isWave64 : Predicate<"Subtarget->isWave64()">,
- AssemblerPredicate <(any_of FeatureWavefrontSize64,
- FeatureAssemblerPermissiveWavesize)>;
-
class AMDGPUMnemonicAlias<string From, string To, string VariantName = "">
: MnemonicAlias<From, To, VariantName>, PredicateControl;
@@ -57,6 +50,8 @@ def GFX10Gen : GFXGen<isGFX10Only, "GFX10", "_gfx10", SIEncodingFamily.GFX10>;
// SI DAG Nodes
//===----------------------------------------------------------------------===//
+// CLAMP clamps the value to the range [0.0, 1.0]. NaN is clamped to 0,
+// following the clamp output modifier behavior with dx10_enable.
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
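// A minimal C++ sketch of the clamp semantics described above (illustration
// only; the helper name is hypothetical):
//
//   #include <algorithm>
//   #include <cmath>
//
//   // Clamp to [0.0, 1.0]; NaN inputs produce 0.0 (dx10_clamp-style behavior).
//   static float ClampDX10(float X) {
//     if (std::isnan(X))
//       return 0.0f;
//     return std::min(std::max(X, 0.0f), 1.0f);
//   }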
def SDTSBufferLoad : SDTypeProfile<1, 3,
@@ -331,7 +326,7 @@ def mfma_f32_32x32x64_f8f6f4 : UnscaledMFMAOptimizationPat<int_amdgcn_mfma_scale
//===----------------------------------------------------------------------===//
class isIntType<ValueType SrcVT> {
- bit ret = !and(SrcVT.isInteger, !ne(SrcVT.Value, i1.Value));
+ bit ret = !and(SrcVT.isInteger, !ne(SrcVT, i1));
}
def SDTSBufferPrefetch : SDTypeProfile<0, 3,
@@ -776,11 +771,7 @@ def xnor : PatFrag <
foreach I = 1-4 in {
def shl#I#_add : PatFrag <
(ops node:$src0, node:$src1),
- (add (shl_oneuse $src0, (i32 I)), $src1)> {
- // FIXME: Poor substitute for disabling pattern in SelectionDAG
- let PredicateCode = [{return false;}];
- let GISelPredicateCode = [{return true;}];
-}
+ (add (shl_oneuse $src0, (i32 I)), $src1)>;
}
multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
@@ -818,6 +809,8 @@ defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
defm atomic_load_uinc_wrap : SIAtomicM0Glue2 <"LOAD_UINC_WRAP">;
defm atomic_load_udec_wrap : SIAtomicM0Glue2 <"LOAD_UDEC_WRAP">;
+defm atomic_load_usub_cond : SIAtomicM0Glue2 <"LOAD_USUB_COND">;
+defm atomic_load_usub_sat : SIAtomicM0Glue2 <"LOAD_USUB_SAT">;
defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
@@ -1796,10 +1789,10 @@ class SIMCInstr <string pseudo, int subtarget> {
class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> {
int ret =
- !if (!eq(Src0.Value, untyped.Value), 0,
- !if (!eq(Src1.Value, untyped.Value), 1, // VOP1
- !if (!eq(Src2.Value, untyped.Value), 2, // VOP2
- 3))); // VOP3
+ !if (!eq(Src0, untyped), 0,
+ !if (!eq(Src1, untyped), 1, // VOP1
+ !if (!eq(Src2, untyped), 2, // VOP2
+ 3))); // VOP3
}
// Returns the register class to use for the destination of VOP[123C]
@@ -1868,17 +1861,17 @@ class getVCSrcForVT<ValueType VT> {
!if(VT.isFP,
!if(!eq(VT.Size, 64),
VCSrc_f64,
- !cond(!eq(VT.Value, f16.Value) : VCSrc_f16,
- !eq(VT.Value, bf16.Value) : VCSrc_bf16,
- !eq(VT.Value, v2f16.Value) : VCSrc_v2f16,
- !eq(VT.Value, v2bf16.Value) : VCSrc_v2bf16,
+ !cond(!eq(VT, f16) : VCSrc_f16,
+ !eq(VT, bf16) : VCSrc_bf16,
+ !eq(VT, v2f16) : VCSrc_v2f16,
+ !eq(VT, v2bf16) : VCSrc_v2bf16,
1 : VCSrc_f32)
),
!if(!eq(VT.Size, 64),
VCSrc_b64,
- !if(!eq(VT.Value, i16.Value),
+ !if(!eq(VT, i16),
VCSrc_b16,
- !if(!eq(VT.Value, v2i16.Value),
+ !if(!eq(VT, v2i16),
VCSrc_v2b16,
VCSrc_b32
)
@@ -2003,28 +1996,28 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
// Float or packed int
class isModifierType<ValueType SrcVT> {
- bit ret = !or(!eq(SrcVT.Value, f16.Value),
- !eq(SrcVT.Value, bf16.Value),
- !eq(SrcVT.Value, f32.Value),
- !eq(SrcVT.Value, f64.Value),
- !eq(SrcVT.Value, v2f16.Value),
- !eq(SrcVT.Value, v2i16.Value),
- !eq(SrcVT.Value, v2bf16.Value),
- !eq(SrcVT.Value, v2f32.Value),
- !eq(SrcVT.Value, v2i32.Value),
- !eq(SrcVT.Value, v4f16.Value),
- !eq(SrcVT.Value, v4i16.Value),
- !eq(SrcVT.Value, v4bf16.Value),
- !eq(SrcVT.Value, v4f32.Value),
- !eq(SrcVT.Value, v4i32.Value),
- !eq(SrcVT.Value, v8f16.Value),
- !eq(SrcVT.Value, v8i16.Value),
- !eq(SrcVT.Value, v8bf16.Value),
- !eq(SrcVT.Value, v8f32.Value),
- !eq(SrcVT.Value, v8i32.Value),
- !eq(SrcVT.Value, v16f16.Value),
- !eq(SrcVT.Value, v16i16.Value),
- !eq(SrcVT.Value, v16bf16.Value));
+ bit ret = !or(!eq(SrcVT, f16),
+ !eq(SrcVT, bf16),
+ !eq(SrcVT, f32),
+ !eq(SrcVT, f64),
+ !eq(SrcVT, v2f16),
+ !eq(SrcVT, v2i16),
+ !eq(SrcVT, v2bf16),
+ !eq(SrcVT, v2f32),
+ !eq(SrcVT, v2i32),
+ !eq(SrcVT, v4f16),
+ !eq(SrcVT, v4i16),
+ !eq(SrcVT, v4bf16),
+ !eq(SrcVT, v4f32),
+ !eq(SrcVT, v4i32),
+ !eq(SrcVT, v8f16),
+ !eq(SrcVT, v8i16),
+ !eq(SrcVT, v8bf16),
+ !eq(SrcVT, v8f32),
+ !eq(SrcVT, v8i32),
+ !eq(SrcVT, v16f16),
+ !eq(SrcVT, v16i16),
+ !eq(SrcVT, v16bf16));
}
// Return type of input modifiers operand for specified input operand.
@@ -2057,9 +2050,9 @@ class getSrcModDPP <ValueType VT> {
class getSrcModDPP_t16 <ValueType VT, bit IsFake16 = 1> {
Operand ret =
!if (VT.isFP,
- !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
+ !if (!or(!eq(VT, f16), !eq(VT, bf16)),
FPT16_Lo128VRegInputMods<IsFake16>, FPVRegInputMods),
- !if (!eq(VT.Value, i16.Value),
+ !if (!eq(VT, i16),
IntT16_Lo128VRegInputMods<IsFake16>, IntVRegInputMods));
}
@@ -2068,11 +2061,11 @@ class getSrcModDPP_t16 <ValueType VT, bit IsFake16 = 1> {
class getSrcModVOP3VC <ValueType VT, bit IsFake16 = 1> {
Operand ret =
!if (VT.isFP,
- !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
+ !if (!or(!eq(VT, f16), !eq(VT, bf16)),
FPT16VCSrcInputMods<IsFake16>,
- !if (!eq(VT.Value, f64.Value), FP64VCSrcInputMods,
+ !if (!eq(VT, f64), FP64VCSrcInputMods,
FP32VCSrcInputMods)),
- !if (!eq(VT.Value, i16.Value),
+ !if (!eq(VT, i16),
IntT16VCSrcInputMods<IsFake16>,
Int32VCSrcInputMods));
}
@@ -2084,15 +2077,15 @@ class getSrcModVOP3VC <ValueType VT, bit IsFake16 = 1> {
class getSrc0ModVOP3DPP <ValueType VT, ValueType DstVT, bit IsFake16 = 1> {
defvar T16Dst =
!if (VT.isFP,
- !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
+ !if (!or(!eq(VT, f16), !eq(VT, bf16)),
FPT16VRegInputMods<IsFake16>, FPVRegT16DstInputMods),
- !if (!eq(VT.Value, i16.Value), IntT16VRegInputMods<IsFake16>,
+ !if (!eq(VT, i16), IntT16VRegInputMods<IsFake16>,
IntVRegT16DstInputMods));
defvar Normal =
!if (VT.isFP,
- !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
+ !if (!or(!eq(VT, f16), !eq(VT, bf16)),
FPT16VRegInputMods<IsFake16>, FPVRegInputMods),
- !if (!eq(VT.Value, i16.Value),
+ !if (!eq(VT, i16),
IntT16VRegInputMods<IsFake16>,
IntVRegInputMods));
Operand ret = !if(!and(!not(IsFake16), !eq(DstVT.Size, 16)), T16Dst, Normal);
@@ -2102,16 +2095,16 @@ class getSrc0ModVOP3DPP <ValueType VT, ValueType DstVT, bit IsFake16 = 1> {
// only operands (VOPD3 vsrc1 and vsrc2).
class getSrcModVOP3V <ValueType VT> {
Operand ret =
- !if (!eq(VT.Value, f64.Value), FP64VRegSrcInputMods,
+ !if (!eq(VT, f64), FP64VRegSrcInputMods,
FP32VRegSrcInputMods);
}
// Return type of input modifiers operand specified input operand for SDWA
class getSrcModSDWA <ValueType VT> {
- Operand ret = !if(!eq(VT.Value, f16.Value), FP16SDWAInputMods,
- !if(!eq(VT.Value, f32.Value), FP32SDWAInputMods,
- !if(!eq(VT.Value, i16.Value), Int16SDWAInputMods,
- !if(!eq(VT.Value, bf16.Value), FP16SDWAInputMods,
+ Operand ret = !if(!eq(VT, f16), FP16SDWAInputMods,
+ !if(!eq(VT, f32), FP32SDWAInputMods,
+ !if(!eq(VT, i16), Int16SDWAInputMods,
+ !if(!eq(VT, bf16), FP16SDWAInputMods,
Int32SDWAInputMods))));
}
@@ -2778,14 +2771,14 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit HasFP8ByteSel = !or(HasFP8SrcByteSel, HasFP8DstByteSel);
field bit HasBitOp3 = 0;
- field bit HasDst = !ne(DstVT.Value, untyped.Value);
+ field bit HasDst = !ne(DstVT, untyped);
field bit HasDst32 = HasDst;
field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case
field bit EmitDstSel = EmitDst;
field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret;
- field bit HasSrc0 = !ne(Src0VT.Value, untyped.Value);
- field bit HasSrc1 = !ne(Src1VT.Value, untyped.Value);
- field bit HasSrc2 = !ne(Src2VT.Value, untyped.Value);
+ field bit HasSrc0 = !ne(Src0VT, untyped);
+ field bit HasSrc1 = !ne(Src1VT, untyped);
+ field bit HasSrc2 = !ne(Src2VT, untyped);
field bit HasSrc0FloatMods = Src0VT.isFP;
field bit HasSrc1FloatMods = Src1VT.isFP;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 6f1feb1..984d1a4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -348,7 +348,7 @@ class WaveReduceOp<string OpName, string TypeStr, ValueType Ty,
// Input list : [Operation_name,
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
-// bit-width
+// input-type
// output register class,
// input register class]
defvar Operations = [
@@ -371,6 +371,11 @@ defvar Operations = [
WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>,
+
+ WaveReduceOp<"fmin", "F32", f32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"fmax", "F32", f32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"fadd", "F32", f32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"fsub", "F32", f32, SGPR_32, VSrc_b32>,
];
foreach Op = Operations in {
@@ -791,6 +796,17 @@ def : GCNPat<
(SI_CALL_ISEL $src0, (i64 0))
>;
+// Funnel shift right (fshr) patterns for uniform inputs.
+// These patterns implement fshr with scalar instructions by constructing the
+// 64-bit value {src0, src1} and performing a single 64-bit right shift.
+def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2),
+ (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), (S_AND_B32 $src2, (i32 31))), sub0))
+>;
+
+def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, (i32 ShiftAmt32Imm:$src2)),
+ (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), $src2), sub0))
+>;
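+// A minimal C++ sketch of the identity behind these patterns (illustration
+// only; the helper name is hypothetical):
+//
+//   #include <cstdint>
+//
+//   // Reference model of a 32-bit funnel shift right: concatenate {A, B}
+//   // into a 64-bit value (A in the high half, matching sub1 in the
+//   // REG_SEQUENCE above) and take the low 32 bits of one right shift by
+//   // the amount modulo 32.
+//   static uint32_t FunnelShiftRight32(uint32_t A, uint32_t B, uint32_t Amt) {
+//     uint64_t Concat = ((uint64_t)A << 32) | B;
+//     return (uint32_t)(Concat >> (Amt & 31));
+//   }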
+
// Wrapper around s_swappc_b64 with extra $callee parameter to track
// the called function after regalloc.
def SI_CALL : SPseudoInstSI <
@@ -804,9 +820,8 @@ def SI_CALL : SPseudoInstSI <
let isConvergent = 1;
}
-class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs),
- (ins rc:$src0, unknown:$callee, i32imm:$fpdiff),
- [(sd i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
+class SI_TCRETURN_Pseudo<RegisterClass rc, list<dag> pattern = []>
+ : SPseudoInstSI <(outs), (ins rc:$src0, unknown:$callee, i32imm:$fpdiff), pattern> {
let Size = 4;
let FixedSize = 1;
let isCall = 1;
@@ -820,8 +835,15 @@ class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs),
}
// Tail call handling pseudo
-def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, AMDGPUtc_return>;
-def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, AMDGPUtc_return_gfx>;
+def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64,
+ [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]>;
+def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64,
+ [(AMDGPUtc_return_gfx i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]>;
+
+// Tail call for chain calling conventions.
+// Uses unrestricted SGPR_64 instead of CCR_SGPR_64 because chain calls
+// never return and don't need to preserve any SGPRs.
+def SI_TCRETURN_CHAIN : SI_TCRETURN_Pseudo<SGPR_64>;
// Handle selecting indirect tail calls
def : GCNPat<
@@ -851,13 +873,13 @@ multiclass SI_CS_CHAIN_TC<
// This is essentially a tail call, but it also takes a mask to put in EXEC
// right before jumping to the callee.
def NAME: SPseudoInstSI <(outs),
- (ins CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>;
+ (ins SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>;
// Same as above, but it will first try to reallocate the VGPRs, and choose an
// EXEC mask and a callee depending on the success of the reallocation attempt.
def _DVGPR : SPseudoInstSI <(outs),
- (ins CCR_SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec,
- SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee)>;
+ (ins SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec,
+ SSrc_b32:$numvgprs, execrc:$fbexec, SGPR_64:$fbcallee)>;
} // End FixedSize = 0 etc
}
@@ -869,7 +891,7 @@ multiclass si_cs_chain_tc_pattern<
dag callee, ValueType execvt, RegisterOperand execrc, Instruction tc> {
def : GCNPat<
(AMDGPUtc_return_chain i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec),
- (tc CCR_SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec)
+ (tc SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec)
>;
}
@@ -896,8 +918,8 @@ multiclass si_cs_chain_tc_dvgpr_patterns<
(AMDGPUtc_return_chain_dvgpr i64:$src0, callee, (i32 timm:$fpdiff),
execvt:$exec, i32:$numvgprs,
execvt:$fbexec, i64:$fbcallee),
- (tc CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec,
- SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee)
+ (tc SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec,
+ SSrc_b32:$numvgprs, execrc:$fbexec, SGPR_64:$fbcallee)
>;
}
}
@@ -1429,7 +1451,7 @@ def : GCNPat <
// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
-let SubtargetPredicate = HasFmaLegacy32 in
+let SubtargetPredicate = HasFmacLegacy32 in
def : GCNPat <
(f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0),
(VOP3NoMods f32:$src1),
@@ -2663,8 +2685,6 @@ def : AMDGPUPat <
let True16Predicate = NotHasTrue16BitInsts in {
let SubtargetPredicate = isNotGFX9Plus in {
-def : ROTRPattern <V_ALIGNBIT_B32_e64>;
-
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
@@ -2675,14 +2695,6 @@ def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:
} // isNotGFX9Plus
let SubtargetPredicate = isGFX9GFX10 in {
-def : GCNPat <
- (rotr i32:$src0, i32:$src1),
- (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src0,
- /* src2_modifiers */ 0,
- $src1, /* clamp */ 0, /* op_sel */ 0)
->;
-
foreach pat = [(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in
def : GCNPat<pat,
@@ -2704,15 +2716,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
} // end True16Predicate = NotHasTrue16BitInsts
let True16Predicate = UseRealTrue16Insts in {
-def : GCNPat <
- (rotr i32:$src0, i32:$src1),
- (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src0,
- /* src2_modifiers */ 0,
- (EXTRACT_SUBREG $src1, lo16),
- /* clamp */ 0, /* op_sel */ 0)
->;
-
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
(V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
@@ -2731,14 +2734,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
} // end True16Predicate = UseRealTrue16Insts
let True16Predicate = UseFakeTrue16Insts in {
-def : GCNPat <
- (rotr i32:$src0, i32:$src1),
- (V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src0,
- /* src2_modifiers */ 0,
- $src1, /* clamp */ 0, /* op_sel */ 0)
->;
-
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
@@ -2930,15 +2925,25 @@ def : GCNPat <
>;
def : GCNPat <
- (i64 (zext i32:$src)),
+ (i64 (UniformUnaryFrag<zext> i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
>;
def : GCNPat <
- (i64 (anyext i32:$src)),
+ (i64 (zext i32:$src)),
+ (REG_SEQUENCE VReg_64, $src, sub0, (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+
+def : GCNPat <
+ (i64 (UniformUnaryFrag<anyext> i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
>;
+def : GCNPat <
+ (i64 (anyext i32:$src)),
+ (REG_SEQUENCE VReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
+>;
+
class ZExt_i64_i1_Pat <SDNode ext> : GCNPat <
(i64 (ext i1:$src)),
(REG_SEQUENCE VReg_64,
@@ -4527,6 +4532,7 @@ def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32 : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
@@ -4725,3 +4731,14 @@ def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> {
let hasSideEffects = 1;
let SubtargetPredicate = isGFX10Plus;
}
+
+defvar VGPR32_Ptr_Opcodes = [LOAD_STACK_GUARD];
+defvar VGPR64_Ptr_Opcodes = !listremove(PseudosWithPtrOps, VGPR32_Ptr_Opcodes);
+
+foreach inst = VGPR32_Ptr_Opcodes in {
+ def : RemapPointerOperands<inst, VGPR_32>;
+}
+
+foreach inst = VGPR64_Ptr_Opcodes in {
+ def : RemapPointerOperands<inst, VReg_64_AlignTarget>;
+}
diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
index 6537b79..340c9f6 100644
--- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
@@ -186,7 +186,7 @@ void SILateBranchLowering::expandChainCall(MachineInstr &MI,
for (int OpIdx = MI.getNumExplicitOperands() - 1; OpIdx >= ExecIdx; --OpIdx)
MI.removeOperand(OpIdx);
- MI.setDesc(TII->get(AMDGPU::SI_TCRETURN));
+ MI.setDesc(TII->get(AMDGPU::SI_TCRETURN_CHAIN));
}
void SILateBranchLowering::earlyTerm(MachineInstr &MI,
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index f0d1117..fcf91e0 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -233,10 +233,11 @@ private:
void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore,
- AMDGPU::OpName OpName, Register DestReg) const;
+ const DebugLoc &DL, AMDGPU::OpName OpName,
+ Register DestReg) const;
Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore,
- AMDGPU::OpName OpName) const;
+ const DebugLoc &DL, AMDGPU::OpName OpName) const;
unsigned read2Opcode(unsigned EltSize) const;
unsigned read2ST64Opcode(unsigned EltSize) const;
@@ -1336,11 +1337,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
AMDGPU::OpName::data1);
- const TargetRegisterClass *DataRC0 =
- TII->getRegClass(Write2Opc, Data0Idx, TRI);
+ const TargetRegisterClass *DataRC0 = TII->getRegClass(Write2Opc, Data0Idx);
- const TargetRegisterClass *DataRC1 =
- TII->getRegClass(Write2Opc, Data1Idx, TRI);
+ const TargetRegisterClass *DataRC1 = TII->getRegClass(Write2Opc, Data1Idx);
if (unsigned SubReg = Data0->getSubReg()) {
DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()),
@@ -1367,10 +1366,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
// Paired.
void SILoadStoreOptimizer::copyToDestRegs(
CombineInfo &CI, CombineInfo &Paired,
- MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName,
- Register DestReg) const {
+ MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL,
+ AMDGPU::OpName OpName, Register DestReg) const {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
@@ -1398,9 +1396,9 @@ void SILoadStoreOptimizer::copyToDestRegs(
Register
SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL,
AMDGPU::OpName OpName) const {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
@@ -1456,7 +1454,8 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
Register DestReg = MRI->createVirtualRegister(SuperRC);
- DebugLoc DL = CI.I->getDebugLoc();
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
Register BaseReg = AddrReg->getReg();
unsigned BaseSubReg = AddrReg->getSubReg();
@@ -1484,7 +1483,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
.addImm(0) // gds
.cloneMergedMemRefs({&*CI.I, &*Paired.I});
- copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
+ copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1541,7 +1540,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
(NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
const MCInstrDesc &Write2Desc = TII->get(Opc);
- DebugLoc DL = CI.I->getDebugLoc();
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
Register BaseReg = AddrReg->getReg();
unsigned BaseSubReg = AddrReg->getSubReg();
@@ -1582,7 +1582,9 @@ MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
+
const unsigned Opcode = getNewOpcode(CI, Paired);
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
@@ -1607,7 +1609,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
- copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
+ copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1618,7 +1620,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
+
const unsigned Opcode = getNewOpcode(CI, Paired);
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
@@ -1639,7 +1643,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
New.addImm(MergedOffset);
New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
- copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
+ copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::sdst, DestReg);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1650,7 +1654,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
+
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1680,7 +1686,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
.addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
- copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
+ copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1691,7 +1697,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
+
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1731,7 +1739,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
.addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
- copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
+ copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1742,12 +1750,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
const unsigned Opcode = getNewOpcode(CI, Paired);
Register SrcReg =
- copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
+ copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
@@ -1789,7 +1798,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
+
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1807,7 +1818,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
.addImm(CI.CPol)
.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
- copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
+ copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1818,12 +1829,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
+
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
const unsigned Opcode = getNewOpcode(CI, Paired);
Register SrcReg =
- copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
+ copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
@@ -2094,12 +2107,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
const unsigned Opcode = getNewOpcode(CI, Paired);
Register SrcReg =
- copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
+ copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 96131bd..9b71001 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -75,7 +75,7 @@ Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF,
bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) {
assert(Changed || ConstrainRegs.empty());
for (Register Reg : ConstrainRegs)
- MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
+ MRI->constrainRegClass(Reg, TII->getRegisterInfo().getWaveMaskRegClass());
ConstrainRegs.clear();
return Changed;
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 40eeeb8..cbd08f0 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -117,27 +117,26 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
MachineFunction &MF = *SaveBlock.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
- const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *RI = ST.getRegisterInfo();
MachineBasicBlock::iterator I = SaveBlock.begin();
- if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
+ if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, RI)) {
for (const CalleeSavedInfo &CS : CSI) {
// Insert the spill to the stack frame.
MCRegister Reg = CS.getReg();
MachineInstrSpan MIS(I, &SaveBlock);
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(
+ const TargetRegisterClass *RC = RI->getMinimalPhysRegClass(
Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32);
// If this value was already livein, we probably have a direct use of the
// incoming register value, so don't kill at the spill point. This happens
// since we pass some special inputs (workgroup IDs) in the callee saved
// range.
- const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, TRI);
+ const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, RI);
TII.storeRegToStackSlot(SaveBlock, I, Reg, !IsLiveIn, CS.getFrameIdx(),
- RC, TRI, Register());
+ RC, Register());
if (Indexes) {
assert(std::distance(MIS.begin(), I) == 1);
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index b398db4..9abda27 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -696,7 +696,6 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
return true;
};
- // TODO: Need to serialize kernarg preloads.
bool Any = false;
Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
@@ -718,6 +717,21 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);
+ // Write FirstKernArgPreloadReg separately, since it's a Register,
+ // not an ArgDescriptor.
+ if (ArgInfo.FirstKernArgPreloadReg) {
+ Register Reg = ArgInfo.FirstKernArgPreloadReg;
+ assert(Reg.isPhysical() &&
+ "FirstKernArgPreloadReg must be a physical register");
+
+ yaml::SIArgument SA = yaml::SIArgument::createArgument(true);
+ raw_string_ostream OS(SA.RegisterName.Value);
+ OS << printReg(Reg, &TRI);
+
+ AI.FirstKernArgPreloadReg = SA;
+ Any = true;
+ }
+
if (Any)
return AI;
@@ -750,7 +764,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
IsWholeWaveFunction(MFI.isWholeWaveFunction()),
DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
- ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
+ ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()),
+ NumKernargPreloadSGPRs(MFI.getNumKernargPreloadedSGPRs()) {
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
SpillPhysVGPRS.push_back(regToString(Reg, TRI));
@@ -799,6 +814,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
ReturnsVoid = YamlMFI.ReturnsVoid;
IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
+ UserSGPRInfo.allocKernargPreloadSGPRs(YamlMFI.NumKernargPreloadSGPRs);
+
if (YamlMFI.ScavengeFI) {
auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
if (!FIOrErr) {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 019c3b7..d901f4c 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -170,6 +170,7 @@ struct SIArgumentInfo {
std::optional<SIArgument> DispatchID;
std::optional<SIArgument> FlatScratchInit;
std::optional<SIArgument> PrivateSegmentSize;
+ std::optional<SIArgument> FirstKernArgPreloadReg;
std::optional<SIArgument> WorkGroupIDX;
std::optional<SIArgument> WorkGroupIDY;
@@ -195,6 +196,7 @@ template <> struct MappingTraits<SIArgumentInfo> {
YamlIO.mapOptional("dispatchID", AI.DispatchID);
YamlIO.mapOptional("flatScratchInit", AI.FlatScratchInit);
YamlIO.mapOptional("privateSegmentSize", AI.PrivateSegmentSize);
+ YamlIO.mapOptional("firstKernArgPreloadReg", AI.FirstKernArgPreloadReg);
YamlIO.mapOptional("workGroupIDX", AI.WorkGroupIDX);
YamlIO.mapOptional("workGroupIDY", AI.WorkGroupIDY);
@@ -305,6 +307,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
unsigned DynamicVGPRBlockSize = 0;
unsigned ScratchReservedForDynamicVGPRs = 0;
+ unsigned NumKernargPreloadSGPRs = 0;
+
SIMachineFunctionInfo() = default;
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
const TargetRegisterInfo &TRI,
@@ -361,6 +365,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false);
YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
MFI.ScratchReservedForDynamicVGPRs, 0);
+ YamlIO.mapOptional("numKernargPreloadSGPRs", MFI.NumKernargPreloadSGPRs, 0);
YamlIO.mapOptional("isWholeWaveFunction", MFI.IsWholeWaveFunction, false);
}
};
@@ -1014,7 +1019,9 @@ public:
void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; }
Register getPrivateSegmentWaveByteOffsetSystemSGPR() const {
- return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
+ if (ArgInfo.PrivateSegmentWaveByteOffset)
+ return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
+ return MCRegister();
}
/// Returns the physical register reserved for use as the resource
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
index fd28abe..2f3ad39 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -323,8 +323,8 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
// Do not Track Physical Registers, because it messes up.
for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
- if (RegMaskPair.RegUnit.isVirtual())
- LiveInRegs.insert(RegMaskPair.RegUnit);
+ if (RegMaskPair.VRegOrUnit.isVirtualReg())
+ LiveInRegs.insert(RegMaskPair.VRegOrUnit.asVirtualReg());
}
LiveOutRegs.clear();
// There is several possibilities to distinguish:
@@ -350,12 +350,13 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
// Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
// The use of findDefBetween removes the case 4.
for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
- Register Reg = RegMaskPair.RegUnit;
- if (Reg.isVirtual() &&
- isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
+ VirtRegOrUnit VRegOrUnit = RegMaskPair.VRegOrUnit;
+ if (VRegOrUnit.isVirtualReg() &&
+ isDefBetween(VRegOrUnit.asVirtualReg(),
+ LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
LIS)) {
- LiveOutRegs.insert(Reg);
+ LiveOutRegs.insert(VRegOrUnit.asVirtualReg());
}
}
@@ -578,11 +579,11 @@ void SIScheduleBlock::printDebug(bool full) {
<< LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
dbgs() << "LiveIns:\n";
for (Register Reg : LiveInRegs)
- dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+ dbgs() << printReg(Reg, DAG->getTRI()) << ' ';
dbgs() << "\nLiveOuts:\n";
for (Register Reg : LiveOutRegs)
- dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+ dbgs() << printReg(Reg, DAG->getTRI()) << ' ';
}
dbgs() << "\nInstructions:\n";
@@ -1446,23 +1447,24 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
}
#endif
- std::set<Register> InRegs = DAG->getInRegs();
+ std::set<VirtRegOrUnit> InRegs = DAG->getInRegs();
addLiveRegs(InRegs);
// Increase LiveOutRegsNumUsages for blocks
// producing registers consumed in another
// scheduling region.
- for (Register Reg : DAG->getOutRegs()) {
+ for (VirtRegOrUnit VRegOrUnit : DAG->getOutRegs()) {
for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
// Do reverse traversal
int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
SIScheduleBlock *Block = Blocks[ID];
const std::set<Register> &OutRegs = Block->getOutRegs();
- if (OutRegs.find(Reg) == OutRegs.end())
+ if (!VRegOrUnit.isVirtualReg() ||
+ OutRegs.find(VRegOrUnit.asVirtualReg()) == OutRegs.end())
continue;
- ++LiveOutRegsNumUsages[ID][Reg];
+ ++LiveOutRegsNumUsages[ID][VRegOrUnit.asVirtualReg()];
break;
}
}
@@ -1565,15 +1567,18 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
maxVregUsage = VregCurrentUsage;
if (SregCurrentUsage > maxSregUsage)
maxSregUsage = SregCurrentUsage;
- LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
- for (SIScheduleBlock *Block : ReadyBlocks)
- dbgs() << Block->getID() << ' ';
- dbgs() << "\nCurrent Live:\n";
- for (Register Reg : LiveRegs)
- dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
- dbgs() << '\n';
- dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
- dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);
+ LLVM_DEBUG({
+ dbgs() << "Picking New Blocks\n";
+ dbgs() << "Available: ";
+ for (SIScheduleBlock *Block : ReadyBlocks)
+ dbgs() << Block->getID() << ' ';
+ dbgs() << "\nCurrent Live:\n";
+ for (Register Reg : LiveRegs)
+ dbgs() << printReg(Reg, DAG->getTRI()) << ' ';
+ dbgs() << '\n';
+ dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
+ dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
+ });
Cand.Block = nullptr;
for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
@@ -1625,13 +1630,13 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
// Tracking of currently alive registers to determine VGPR Usage.
-void SIScheduleBlockScheduler::addLiveRegs(std::set<Register> &Regs) {
- for (Register Reg : Regs) {
+void SIScheduleBlockScheduler::addLiveRegs(std::set<VirtRegOrUnit> &Regs) {
+ for (VirtRegOrUnit VRegOrUnit : Regs) {
// For now only track virtual registers.
- if (!Reg.isVirtual())
+ if (!VRegOrUnit.isVirtualReg())
continue;
// If not already in the live set, then add it.
- (void) LiveRegs.insert(Reg);
+ (void)LiveRegs.insert(VRegOrUnit.asVirtualReg());
}
}
@@ -1662,7 +1667,7 @@ void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
decreaseLiveRegs(Block, Block->getInRegs());
- addLiveRegs(Block->getOutRegs());
+ LiveRegs.insert(Block->getOutRegs().begin(), Block->getOutRegs().end());
releaseBlockSuccs(Block);
for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) {
// We produce this register, thus it must not be previously alive.
@@ -1689,7 +1694,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs,
continue;
if (LiveRegsConsumers[Reg] > 1)
continue;
- PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+ PSetIterator PSetI = DAG->getMRI()->getPressureSets(VirtRegOrUnit(Reg));
for (; PSetI.isValid(); ++PSetI) {
DiffSetPressure[*PSetI] -= PSetI.getWeight();
}
@@ -1699,7 +1704,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs,
// For now only track virtual registers.
if (!Reg.isVirtual())
continue;
- PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+ PSetIterator PSetI = DAG->getMRI()->getPressureSets(VirtRegOrUnit(Reg));
for (; PSetI.isValid(); ++PSetI) {
DiffSetPressure[*PSetI] += PSetI.getWeight();
}
@@ -1846,7 +1851,7 @@ SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
// For now only track virtual registers
if (!Reg.isVirtual())
continue;
- PSetIterator PSetI = MRI.getPressureSets(Reg);
+ PSetIterator PSetI = MRI.getPressureSets(VirtRegOrUnit(Reg));
for (; PSetI.isValid(); ++PSetI) {
if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
VgprUsage += PSetI.getWeight();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
index b219cbd..1245774 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -389,7 +389,7 @@ private:
SIBlockSchedCandidate &TryCand);
SIScheduleBlock *pickBlock();
- void addLiveRegs(std::set<Register> &Regs);
+ void addLiveRegs(std::set<VirtRegOrUnit> &Regs);
void decreaseLiveRegs(SIScheduleBlock *Block, std::set<Register> &Regs);
void releaseBlockSuccs(SIScheduleBlock *Parent);
void blockScheduled(SIScheduleBlock *Block);
@@ -462,18 +462,18 @@ public:
unsigned &VgprUsage,
unsigned &SgprUsage);
- std::set<Register> getInRegs() {
- std::set<Register> InRegs;
+ std::set<VirtRegOrUnit> getInRegs() {
+ std::set<VirtRegOrUnit> InRegs;
for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
- InRegs.insert(RegMaskPair.RegUnit);
+ InRegs.insert(RegMaskPair.VRegOrUnit);
}
return InRegs;
}
- std::set<unsigned> getOutRegs() {
- std::set<unsigned> OutRegs;
+ std::set<VirtRegOrUnit> getOutRegs() {
+ std::set<VirtRegOrUnit> OutRegs;
for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
- OutRegs.insert(RegMaskPair.RegUnit);
+ OutRegs.insert(RegMaskPair.VRegOrUnit);
}
return OutRegs;
};
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 6ab8d552..a082d53 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -302,16 +302,17 @@ protected:
SICacheControl(const GCNSubtarget &ST);
- /// Sets named bit \p BitName to "true" if present in instruction \p MI.
+ /// Sets the CPol bits given by \p Bits if instruction \p MI has a CPol operand.
/// \returns Returns true if \p MI is modified, false otherwise.
- bool enableNamedBit(const MachineBasicBlock::iterator MI,
- AMDGPU::CPol::CPol Bit) const;
+ bool enableCPolBits(const MachineBasicBlock::iterator MI,
+ unsigned Bits) const;
/// Check if any atomic operation on AS can affect memory accessible via the
/// global address space.
bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const;
public:
+ using CPol = AMDGPU::CPol::CPol;
/// Create a cache control for the subtarget \p ST.
static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
@@ -401,21 +402,9 @@ public:
virtual ~SICacheControl() = default;
};
-class SIGfx6CacheControl : public SICacheControl {
-protected:
-
- /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::GLC);
- }
-
- /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::SLC);
- }
-
+/// Generates code sequences for the memory model of all GFX targets below
+/// GFX10.
+class SIGfx6CacheControl final : public SICacheControl {
public:
SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
@@ -454,76 +443,10 @@ public:
Position Pos) const override;
};
-class SIGfx7CacheControl : public SIGfx6CacheControl {
+/// Generates code sequences for the memory model of GFX10/11.
+class SIGfx10CacheControl final : public SICacheControl {
public:
-
- SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
-
- bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
-
-};
-
-class SIGfx90ACacheControl : public SIGfx7CacheControl {
-public:
-
- SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
-
- bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal,
- bool IsLastUse) const override;
-
- bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order, bool AtomicsOnly) const override;
-
- bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
-
- bool insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const override;
-};
-
-class SIGfx940CacheControl : public SIGfx90ACacheControl {
-protected:
-
- /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::SC0);
- }
-
- /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::SC1);
- }
-
- /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::NT);
- }
-
-public:
- SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
+ SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -531,42 +454,16 @@ public:
bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
+ SIAtomicAddrSpace AddrSpace) const override {
+ return false;
+ }
bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal,
- bool IsLastUse) const override;
-
- bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, Position Pos) const override;
-
- bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
- Position Pos) const override;
-};
-
-class SIGfx10CacheControl : public SIGfx7CacheControl {
-protected:
-
- /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::DLC);
+ SIAtomicAddrSpace AddrSpace) const override {
+ return false;
}
-public:
-
- SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
-
- bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile, bool IsNonTemporal,
@@ -579,28 +476,23 @@ public:
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
-};
-
-class SIGfx11CacheControl : public SIGfx10CacheControl {
-public:
- SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
-
- bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
- bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal,
- bool IsLastUse) const override;
+ bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override {
+ return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
+ /*AtomicsOnly=*/false);
+ }
};
-class SIGfx12CacheControl : public SIGfx11CacheControl {
+class SIGfx12CacheControl final : public SICacheControl {
protected:
// Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
// \returns Returns true if \p MI is modified, false otherwise.
bool setTH(const MachineBasicBlock::iterator MI,
AMDGPU::CPol::CPol Value) const;
+
// Sets Scope policy to \p Value if CPol operand is present in instruction \p
// MI. \returns Returns true if \p MI is modified, false otherwise.
bool setScope(const MachineBasicBlock::iterator MI,
@@ -619,7 +511,7 @@ protected:
SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
public:
- SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
+ SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {
// GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
// the behavior is the same if assuming GFX12.0 in CU mode.
assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
@@ -777,7 +669,7 @@ getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
const char *Msg) const {
- const Function &Func = MI->getParent()->getParent()->getFunction();
+ const Function &Func = MI->getMF()->getFunction();
Func.getContext().diagnose(
DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
}
@@ -884,6 +776,13 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
}
}
+ // FIXME: The MMO of buffer atomic instructions does not always have an atomic
+ // ordering. We only need to handle VBUFFER atomics on GFX12+ so we can fix it
+ // here, but the lowering should really be cleaned up at some point.
+ if ((ST.getGeneration() >= GCNSubtarget::GFX12) && SIInstrInfo::isBUF(*MI) &&
+ SIInstrInfo::isAtomic(*MI) && Ordering == AtomicOrdering::NotAtomic)
+ Ordering = AtomicOrdering::Monotonic;
+
SIAtomicScope Scope = SIAtomicScope::NONE;
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
bool IsCrossAddressSpaceOrdering = false;
@@ -1006,13 +905,13 @@ SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}
-bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
- AMDGPU::CPol::CPol Bit) const {
+bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator MI,
+ unsigned Bits) const {
MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
if (!CPol)
return false;
- CPol->setImm(CPol->getImm() | Bit);
+ CPol->setImm(CPol->getImm() | Bits);
return true;
}
@@ -1028,18 +927,10 @@ bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
- if (ST.hasGFX940Insts())
- return std::make_unique<SIGfx940CacheControl>(ST);
- if (ST.hasGFX90AInsts())
- return std::make_unique<SIGfx90ACacheControl>(ST);
- if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
- return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX10)
- return std::make_unique<SIGfx7CacheControl>(ST);
- if (Generation < AMDGPUSubtarget::GFX11)
- return std::make_unique<SIGfx10CacheControl>(ST);
+ return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX12)
- return std::make_unique<SIGfx11CacheControl>(ST);
+ return std::make_unique<SIGfx10CacheControl>(ST);
return std::make_unique<SIGfx12CacheControl>(ST);
}
@@ -1048,33 +939,61 @@ bool SIGfx6CacheControl::enableLoadCacheBypass(
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && !MI->mayStore());
- bool Changed = false;
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
+ if (!canAffectGlobalAddrSpace(AddrSpace)) {
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+ return false;
+ }
+
+ bool Changed = false;
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ if (ST.hasGFX940Insts()) {
+ // Set SC bits to indicate system scope.
+ Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
+ break;
+ }
+ [[fallthrough]];
+ case SIAtomicScope::AGENT:
+ if (ST.hasGFX940Insts()) {
+ // Set SC bits to indicate agent scope.
+ Changed |= enableCPolBits(MI, CPol::SC1);
+ } else {
// Set L1 cache policy to MISS_EVICT.
// Note: there is no L2 cache bypass policy at the ISA level.
- Changed |= enableGLCBit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to bypass.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
+ Changed |= enableCPolBits(MI, CPol::GLC);
+ }
+ break;
+ case SIAtomicScope::WORKGROUP:
+ if (ST.hasGFX940Insts()) {
+ // In threadgroup split mode the waves of a work-group can be executing
+ // on different CUs. Therefore need to bypass the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be bypassed. Setting
+ // SC bits to indicate work-group scope will do this automatically.
+ Changed |= enableCPolBits(MI, CPol::SC0);
+ } else if (ST.hasGFX90AInsts()) {
+ // In threadgroup split mode the waves of a work-group can be executing
+ // on different CUs. Therefore need to bypass the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be bypassed.
+ if (ST.isTgSplitEnabled())
+ Changed |= enableCPolBits(MI, CPol::GLC);
}
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
}
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
return Changed;
}
@@ -1085,8 +1004,39 @@ bool SIGfx6CacheControl::enableStoreCacheBypass(
assert(!MI->mayLoad() && MI->mayStore());
bool Changed = false;
- /// The L1 cache is write through so does not need to be bypassed. There is no
- /// bypass control for the L2 cache at the isa level.
+ /// For targets other than GFX940, the L1 cache is write through so does not
+ /// need to be bypassed. There is no bypass control for the L2 cache at the
+ /// isa level.
+
+ if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC bits to indicate system scope.
+ Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
+ break;
+ case SIAtomicScope::AGENT:
+ // Set SC bits to indicate agent scope.
+ Changed |= enableCPolBits(MI, CPol::SC1);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // Set SC bits to indicate workgroup scope.
+ Changed |= enableCPolBits(MI, CPol::SC0);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Leave SC bits unset to indicate wavefront scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+ }
return Changed;
}
@@ -1098,10 +1048,31 @@ bool SIGfx6CacheControl::enableRMWCacheBypass(
assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;
- /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
- /// bypassed, and the GLC bit is instead used to indicate if they are
- /// return or no-return.
- /// Note: there is no L2 cache coherent bypass control at the ISA level.
+ /// For targets other than GFX940, do not set GLC for RMW atomic operations as
+ /// L0/L1 cache is automatically bypassed, and the GLC bit is instead used to
+ /// indicate if they are return or no-return. Note: there is no L2 cache
+ /// coherent bypass control at the ISA level.
+ /// For GFX90A+, RMW atomics implicitly bypass the L1 cache.
+
+ if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC1 bit to indicate system scope.
+ Changed |= enableCPolBits(MI, CPol::SC1);
+ break;
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // RMW atomic operations implicitly bypass the L1 cache and only use SC1
+ // to indicate system or agent scope. The SC0 bit is used to indicate if
+ // they are return or no-return. Leave SC1 bit unset to indicate agent
+ // scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
return Changed;
}
@@ -1123,11 +1094,15 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
bool Changed = false;
if (IsVolatile) {
- // Set L1 cache policy to be MISS_EVICT for load instructions
- // and MISS_LRU for store instructions.
- // Note: there is no L2 cache bypass policy at the ISA level.
- if (Op == SIMemOp::LOAD)
- Changed |= enableGLCBit(MI);
+ if (ST.hasGFX940Insts()) {
+ // Set SC bits to indicate system scope.
+ Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
+ } else if (Op == SIMemOp::LOAD) {
+ // Set L1 cache policy to be MISS_EVICT for load instructions
+ // and MISS_LRU for store instructions.
+ // Note: there is no L2 cache bypass policy at the ISA level.
+ Changed |= enableCPolBits(MI, CPol::GLC);
+ }
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
@@ -1142,10 +1117,13 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
}
if (IsNonTemporal) {
- // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
- // for both loads and stores, and the L2 cache policy to STREAM.
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
+ if (ST.hasGFX940Insts()) {
+ Changed |= enableCPolBits(MI, CPol::NT);
+ } else {
+ // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
+ // for both loads and stores, and the L2 cache policy to STREAM.
+ Changed |= enableCPolBits(MI, CPol::SLC | CPol::GLC);
+ }
return Changed;
}
@@ -1166,6 +1144,26 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
+ // GFX90A+
+ if (ST.hasGFX90AInsts() && ST.isTgSplitEnabled()) {
+ // In threadgroup split mode the waves of a work-group can be executing on
+ // different CUs. Therefore need to wait for global or GDS memory operations
+ // to complete to ensure they are visible to waves in the other CUs.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are on
+ // the same CU, so no need to wait for global memory as all waves in the
+ // work-group access the same L1, nor wait for GDS as accesses are ordered
+ // on a CU.
+ if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
+ SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
+ (Scope == SIAtomicScope::WORKGROUP)) {
+ // Same as pre-GFX90A targets at AGENT scope.
+ Scope = SIAtomicScope::AGENT;
+ }
+ // In threadgroup split mode LDS cannot be allocated so no need to wait for
+ // LDS memory operations.
+ AddrSpace &= ~SIAtomicAddrSpace::LDS;
+ }
+
bool VMCnt = false;
bool LGKMCnt = false;
@@ -1260,62 +1258,13 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
return Changed;
}
-bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
+static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST) {
+ if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return false;
-
- bool Changed = false;
-
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
- Changed = true;
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to invalidate.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory cache
- /// to be flushed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- if (Pos == Position::AFTER)
- --MI;
-
- return Changed;
+ return !ST.isAmdPalOS() && !ST.isMesa3DOS();
}
-bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
- return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
- /*AtomicsOnly=*/false);
-}
-
-bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
@@ -1327,235 +1276,95 @@ bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
- const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
-
- const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
- ? AMDGPU::BUFFER_WBINVL1
- : AMDGPU::BUFFER_WBINVL1_VOL;
-
if (Pos == Position::AFTER)
++MI;
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
- Changed = true;
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to invalidate.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory cache
- /// to be flushed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- if (Pos == Position::AFTER)
- --MI;
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::enableLoadCacheBypass(
- const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && !MI->mayStore());
- bool Changed = false;
+ const unsigned InvalidateL1 = canUseBUFFER_WBINVL1_VOL(ST)
+ ? AMDGPU::BUFFER_WBINVL1_VOL
+ : AMDGPU::BUFFER_WBINVL1;
if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- // Set the L1 cache policy to MISS_LRU.
- // Note: there is no L2 cache bypass policy at the ISA level.
- Changed |= enableGLCBit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to bypass the L1 which is per CU.
- // Otherwise in non-threadgroup split mode all waves of a work-group are
- // on the same CU, and so the L1 does not need to be bypassed.
- if (ST.isTgSplitEnabled())
- Changed |= enableGLCBit(MI);
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to bypass.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::enableRMWCacheBypass(
- const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && MI->mayStore());
- bool Changed = false;
+ if (ST.hasGFX940Insts()) {
+ // Ensures that following loads will not see stale remote VMEM data or
+ // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
+ // and CC will never be stale due to the local memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+ // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
+ // remove any cache lines of earlier writes by the same wave and ensures
+ // later reads by the same wave will refetch the cache lines.
+ Changed = true;
+ break;
+ }
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
+ if (ST.hasGFX90AInsts()) {
+ // Ensures that following loads will not see stale remote VMEM data or
+ // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
+ // and CC will never be stale due to the local memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
+ BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
+ // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed
+ // to remove any cache lines of earlier writes by the same wave and
+ // ensures later reads by the same wave will refetch the cache lines.
+ Changed = true;
+ break;
+ }
+ [[fallthrough]];
case SIAtomicScope::AGENT:
- /// Do not set glc for RMW atomic operations as they implicitly bypass
- /// the L1 cache, and the glc bit is instead used to indicate if they are
- /// return or no-return.
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
- MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
- // Only handle load and store, not atomic read-modify-write insructions. The
- // latter use glc to indicate if the atomic returns a result and so must not
- // be used for cache control.
- assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
-
- // Only update load and store, not LLVM IR atomic read-modify-write
- // instructions. The latter are always marked as volatile so cannot sensibly
- // handle it as do not want to pessimize all atomics. Also they do not support
- // the nontemporal attribute.
- assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
-
- bool Changed = false;
-
- if (IsVolatile) {
- // Set L1 cache policy to be MISS_EVICT for load instructions
- // and MISS_LRU for store instructions.
- // Note: there is no L2 cache bypass policy at the ISA level.
- if (Op == SIMemOp::LOAD)
- Changed |= enableGLCBit(MI);
-
- // Ensure operation has completed at system scope to cause all volatile
- // operations to be visible outside the program in a global order. Do not
- // request cross address space as only the global address space can be
- // observable outside the program, so no need to cause a waitcnt for LDS
- // address space operations.
- Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered,
- /*AtomicsOnly=*/false);
-
- return Changed;
- }
-
- if (IsNonTemporal) {
- // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
- // for both loads and stores, and the L2 cache policy to STREAM.
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
- return Changed;
- }
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsCrossAddrSpaceOrdering,
- Position Pos, AtomicOrdering Order,
- bool AtomicsOnly) const {
- if (ST.isTgSplitEnabled()) {
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to wait for global or GDS memory operations
- // to complete to ensure they are visible to waves in the other CUs.
- // Otherwise in non-threadgroup split mode all waves of a work-group are on
- // the same CU, so no need to wait for global memory as all waves in the
- // work-group access the same the L1, nor wait for GDS as access are ordered
- // on a CU.
- if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
- SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
- (Scope == SIAtomicScope::WORKGROUP)) {
- // Same as GFX7 using agent scope.
- Scope = SIAtomicScope::AGENT;
- }
- // In threadgroup split mode LDS cannot be allocated so no need to wait for
- // LDS memory operations.
- AddrSpace &= ~SIAtomicAddrSpace::LDS;
- }
- return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
- IsCrossAddrSpaceOrdering, Pos, Order,
- AtomicsOnly);
-}
-
-bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
- return false;
-
- bool Changed = false;
-
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Ensures that following loads will not see stale remote VMEM data or
- // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
- // CC will never be stale due to the local memory probes.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
- // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
- // remove any cache lines of earlier writes by the same wave and ensures
- // later reads by the same wave will refetch the cache lines.
+ if (ST.hasGFX940Insts()) {
+ // Ensures that following loads will not see stale remote data or local
+ // MTYPE NC global data. Local MTYPE RW and CC memory will never be
+ // stale due to the memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
+ // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+ // does not reorder memory operations with respect to a preceding buffer
+ // invalidate. The invalidate is guaranteed to remove any cache lines of
+ // earlier writes and ensures later reads will refetch the cache lines.
+ } else
+ BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
Changed = true;
break;
- case SIAtomicScope::AGENT:
- // Same as GFX7.
- break;
case SIAtomicScope::WORKGROUP:
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to invalidate the L1 which is per CU.
- // Otherwise in non-threadgroup split mode all waves of a work-group are
- // on the same CU, and so the L1 does not need to be invalidated.
if (ST.isTgSplitEnabled()) {
- // Same as GFX7 using agent scope.
- Scope = SIAtomicScope::AGENT;
+ if (ST.hasGFX940Insts()) {
+ // In threadgroup split mode the waves of a work-group can be
+ // executing on different CUs. Therefore need to invalidate the L1
+ // which is per CU. Otherwise in non-threadgroup split mode all waves
+ // of a work-group are on the same CU, and so the L1 does not need to
+ // be invalidated.
+
+ // Ensures L1 is invalidated if in threadgroup split mode. In
+ // non-threadgroup split mode it is a NOP, but no point generating it
+ // in that case when we know we are not in that mode.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate work-group scope.
+ .addImm(AMDGPU::CPol::SC0);
+ // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+ // does not reorder memory operations with respect to a preceding
+ // buffer invalidate. The invalidate is guaranteed to remove any cache
+ // lines of earlier writes and ensures later reads will refetch the
+ // cache lines.
+ Changed = true;
+ } else if (ST.hasGFX90AInsts()) {
+ BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
+ Changed = true;
+ }
}
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
- // Same as GFX7.
+ // For GFX940, we could generate "BUFFER_INV" but it would do nothing as
+ // there are no caches to invalidate. All other targets have no cache to
+ // invalidate.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
@@ -1572,356 +1381,65 @@ bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
--MI;
- Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
-
return Changed;
}
-bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
- bool Changed = false;
-
- MachineBasicBlock &MBB = *MI->getParent();
- const DebugLoc &DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
- // to initiate writeback of any dirty cache lines of earlier writes by the
- // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
- // writeback has completed.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
- // Set SC bits to indicate system scope.
- .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
- // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
- // vmcnt(0)" needed by the "BUFFER_WBL2".
- Changed = true;
- break;
- case SIAtomicScope::AGENT:
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Same as GFX7.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- if (Pos == Position::AFTER)
- --MI;
-
- Changed |=
- SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
- IsCrossAddrSpaceOrdering, Pos);
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::enableLoadCacheBypass(
- const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && !MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Set SC bits to indicate system scope.
- Changed |= enableSC0Bit(MI);
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::AGENT:
- // Set SC bits to indicate agent scope.
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to bypass the L1 which is per CU.
- // Otherwise in non-threadgroup split mode all waves of a work-group are
- // on the same CU, and so the L1 does not need to be bypassed. Setting SC
- // bits to indicate work-group scope will do this automatically.
- Changed |= enableSC0Bit(MI);
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Leave SC bits unset to indicate wavefront scope.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::enableStoreCacheBypass(
- const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
- assert(!MI->mayLoad() && MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Set SC bits to indicate system scope.
- Changed |= enableSC0Bit(MI);
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::AGENT:
- // Set SC bits to indicate agent scope.
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- // Set SC bits to indicate workgroup scope.
- Changed |= enableSC0Bit(MI);
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Leave SC bits unset to indicate wavefront scope.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::enableRMWCacheBypass(
- const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Set SC1 bit to indicate system scope.
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::AGENT:
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // RMW atomic operations implicitly bypass the L1 cache and only use SC1
- // to indicate system or agent scope. The SC0 bit is used to indicate if
- // they are return or no-return. Leave SC1 bit unset to indicate agent
- // scope.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
- MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
- // Only handle load and store, not atomic read-modify-write insructions. The
- // latter use glc to indicate if the atomic returns a result and so must not
- // be used for cache control.
- assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
-
- // Only update load and store, not LLVM IR atomic read-modify-write
- // instructions. The latter are always marked as volatile so cannot sensibly
- // handle it as do not want to pessimize all atomics. Also they do not support
- // the nontemporal attribute.
- assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
-
- bool Changed = false;
-
- if (IsVolatile) {
- // Set SC bits to indicate system scope.
- Changed |= enableSC0Bit(MI);
- Changed |= enableSC1Bit(MI);
-
- // Ensure operation has completed at system scope to cause all volatile
- // operations to be visible outside the program in a global order. Do not
- // request cross address space as only the global address space can be
- // observable outside the program, so no need to cause a waitcnt for LDS
- // address space operations.
- Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered,
- /*AtomicsOnly=*/false);
-
- return Changed;
- }
-
- if (IsNonTemporal) {
- Changed |= enableNTBit(MI);
- return Changed;
- }
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
- return false;
-
+bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
bool Changed = false;
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
+ if (ST.hasGFX90AInsts()) {
+ MachineBasicBlock &MBB = *MI->getParent();
+ const DebugLoc &DL = MI->getDebugLoc();
- if (Pos == Position::AFTER)
- ++MI;
+ if (Pos == Position::AFTER)
+ ++MI;
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Ensures that following loads will not see stale remote VMEM data or
- // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
- // CC will never be stale due to the local memory probes.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
- // Set SC bits to indicate system scope.
- .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
- // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
- // remove any cache lines of earlier writes by the same wave and ensures
- // later reads by the same wave will refetch the cache lines.
- Changed = true;
- break;
- case SIAtomicScope::AGENT:
- // Ensures that following loads will not see stale remote date or local
- // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
- // due to the memory probes.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
- // Set SC bits to indicate agent scope.
- .addImm(AMDGPU::CPol::SC1);
- // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
- // does not reorder memory operations with respect to preceeding buffer
- // invalidate. The invalidate is guaranteed to remove any cache lines of
- // earlier writes and ensures later writes will refetch the cache lines.
- Changed = true;
- break;
- case SIAtomicScope::WORKGROUP:
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to invalidate the L1 which is per CU.
- // Otherwise in non-threadgroup split mode all waves of a work-group are
- // on the same CU, and so the L1 does not need to be invalidated.
- if (ST.isTgSplitEnabled()) {
- // Ensures L1 is invalidated if in threadgroup split mode. In
- // non-threadgroup split mode it is a NOP, but no point generating it in
- // that case if know not in that mode.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
- // Set SC bits to indicate work-group scope.
- .addImm(AMDGPU::CPol::SC0);
- // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
- // does not reorder memory operations with respect to preceeding buffer
- // invalidate. The invalidate is guaranteed to remove any cache lines of
- // earlier writes and ensures later writes will refetch the cache lines.
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
+ // to initiate writeback of any dirty cache lines of earlier writes by
+ // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
+ // writeback has completed.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ if (ST.hasGFX940Insts()) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
+
+ // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
+ // SIAtomicScope::AGENT, the following insertWait will generate the
+ // required "S_WAITCNT vmcnt(0)".
+ Changed = true;
+ }
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it
+ // would write back, and it would require an otherwise unnecessary
+ // "S_WAITCNT vmcnt(0)".
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
}
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Could generate "BUFFER_INV" but it would do nothing as there are no
- // caches to invalidate.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
}
- }
-
- /// The scratch address space does not need the global memory cache
- /// to be flushed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- if (Pos == Position::AFTER)
- --MI;
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
- bool Changed = false;
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
- // to initiate writeback of any dirty cache lines of earlier writes by the
- // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
- // writeback has completed.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
- // Set SC bits to indicate system scope.
- .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
- // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
- // SIAtomicScope::SYSTEM, the following insertWait will generate the
- // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
- Changed = true;
- break;
- case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
- // Set SC bits to indicate agent scope.
- .addImm(AMDGPU::CPol::SC1);
-
- // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
- // SIAtomicScope::AGENT, the following insertWait will generate the
- // required "S_WAITCNT vmcnt(0)".
- Changed = true;
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Do not generate "BUFFER_WBL2" as there are no caches it would
- // writeback, and would require an otherwise unnecessary
- // "S_WAITCNT vmcnt(0)".
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
+ if (Pos == Position::AFTER)
+ --MI;
}
- if (Pos == Position::AFTER)
- --MI;
-
// Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
// S_WAITCNT needed.
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
@@ -1932,8 +1450,7 @@ bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
}
bool SIGfx10CacheControl::enableLoadCacheBypass(
- const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
+ const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && !MI->mayStore());
bool Changed = false;
@@ -1944,8 +1461,9 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
case SIAtomicScope::AGENT:
// Set the L0 and L1 cache policies to MISS_EVICT.
// Note: there is no L2 cache coherent bypass control at the ISA level.
- Changed |= enableGLCBit(MI);
- Changed |= enableDLCBit(MI);
+ // For GFX10, set GLC and DLC; for GFX11, only set GLC.
+ Changed |=
+ enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0));
break;
case SIAtomicScope::WORKGROUP:
// In WGP mode the waves of a work-group can be executing on either CU of
@@ -1953,7 +1471,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
// CU mode all waves of a work-group are on the same CU, and so the L0
// does not need to be bypassed.
if (!ST.isCuModeEnabled())
- Changed |= enableGLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC);
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
@@ -1996,10 +1514,13 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
// and MISS_LRU for store instructions.
// Note: there is no L2 cache coherent bypass control at the ISA level.
if (Op == SIMemOp::LOAD) {
- Changed |= enableGLCBit(MI);
- Changed |= enableDLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
}
+ // GFX11: Set MALL NOALLOC for both load and store instructions.
+ if (AMDGPU::isGFX11(ST))
+ Changed |= enableCPolBits(MI, CPol::DLC);
+
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
@@ -2017,8 +1538,12 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
// For stores setting both GLC and SLC configures L0 and L1 cache policy
// to MISS_EVICT and the L2 cache policy to STREAM.
if (Op == SIMemOp::STORE)
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC);
+ Changed |= enableCPolBits(MI, CPol::SLC);
+
+ // GFX11: Set MALL NOALLOC for both load and store instructions.
+ if (AMDGPU::isGFX11(ST))
+ Changed |= enableCPolBits(MI, CPol::DLC);
return Changed;
}
@@ -2218,102 +1743,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return Changed;
}
-bool SIGfx11CacheControl::enableLoadCacheBypass(
- const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && !MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- // Set the L0 and L1 cache policies to MISS_EVICT.
- // Note: there is no L2 cache coherent bypass control at the ISA level.
- Changed |= enableGLCBit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- // In WGP mode the waves of a work-group can be executing on either CU of
- // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
- // CU mode all waves of a work-group are on the same CU, and so the L0
- // does not need to be bypassed.
- if (!ST.isCuModeEnabled())
- Changed |= enableGLCBit(MI);
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to bypass.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- return Changed;
-}
-
-bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
- MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
-
- // Only handle load and store, not atomic read-modify-write insructions. The
- // latter use glc to indicate if the atomic returns a result and so must not
- // be used for cache control.
- assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
-
- // Only update load and store, not LLVM IR atomic read-modify-write
- // instructions. The latter are always marked as volatile so cannot sensibly
- // handle it as do not want to pessimize all atomics. Also they do not support
- // the nontemporal attribute.
- assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
-
- bool Changed = false;
-
- if (IsVolatile) {
- // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
- // and MISS_LRU for store instructions.
- // Note: there is no L2 cache coherent bypass control at the ISA level.
- if (Op == SIMemOp::LOAD)
- Changed |= enableGLCBit(MI);
-
- // Set MALL NOALLOC for load and store instructions.
- Changed |= enableDLCBit(MI);
-
- // Ensure operation has completed at system scope to cause all volatile
- // operations to be visible outside the program in a global order. Do not
- // request cross address space as only the global address space can be
- // observable outside the program, so no need to cause a waitcnt for LDS
- // address space operations.
- Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered,
- /*AtomicsOnly=*/false);
- return Changed;
- }
-
- if (IsNonTemporal) {
- // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
- // and L2 cache policy to STREAM.
- // For stores setting both GLC and SLC configures L0 and L1 cache policy
- // to MISS_EVICT and the L2 cache policy to STREAM.
- if (Op == SIMemOp::STORE)
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
-
- // Set MALL NOALLOC for load and store instructions.
- Changed |= enableDLCBit(MI);
- return Changed;
- }
-
- return Changed;
-}
-
bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
AMDGPU::CPol::CPol Value) const {
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
@@ -2637,6 +2066,13 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
if (IsVolatile) {
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
+ if (ST.requiresWaitXCntForSingleAccessInstructions() &&
+ SIInstrInfo::isVMEM(*MI)) {
+ MachineBasicBlock &MBB = *MI->getParent();
+ BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
+ Changed = true;
+ }
+
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
@@ -2655,9 +2091,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
const bool IsRMW = (MI.mayLoad() && MI.mayStore());
bool Changed = false;
- // GFX12.5 only: xcnt wait is needed before flat and global atomics
- // stores/rmw.
- if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
+ if (Atomic && ST.requiresWaitXCntForSingleAccessInstructions() &&
+ SIInstrInfo::isVMEM(MI)) {
MachineBasicBlock &MBB = *MI.getParent();
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
Changed = true;
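
For orientation, the memory-legalizer changes above fold the former GFX90A/GFX940/GFX11 subclasses into per-scope CPol bit selection inside the remaining cache-control classes. The following is a minimal standalone sketch of the load-bypass mapping only; the enum, bit values and function name are illustrative placeholders, not the real AMDGPU::CPol encodings or the SIMemoryLegalizer API.

// Illustrative sketch: mirrors the scope handling in the merged
// SIGfx6CacheControl::enableLoadCacheBypass. Bit values are placeholders.
#include <cstdint>

enum : uint32_t { BIT_GLC = 1u << 0, BIT_SC0 = 1u << 1, BIT_SC1 = 1u << 2 };
enum class Scope { SingleThread, Wavefront, Workgroup, Agent, System };

uint32_t loadBypassBits(Scope S, bool HasGFX940Insts, bool HasGFX90AInsts,
                        bool TgSplitEnabled) {
  switch (S) {
  case Scope::System:
    if (HasGFX940Insts)
      return BIT_SC0 | BIT_SC1; // SC0|SC1 selects system scope on GFX940.
    [[fallthrough]];            // Older targets treat SYSTEM like AGENT.
  case Scope::Agent:
    return HasGFX940Insts ? BIT_SC1 : BIT_GLC;
  case Scope::Workgroup:
    if (HasGFX940Insts)
      return BIT_SC0;           // SC0 alone selects work-group scope.
    // GFX90A only needs the L1 bypass when waves of a work-group may run on
    // different CUs (threadgroup split mode).
    return (HasGFX90AInsts && TgSplitEnabled) ? BIT_GLC : 0;
  case Scope::Wavefront:
  case Scope::SingleThread:
    return 0;                   // No cache to bypass.
  }
  return 0;
}
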
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index bfac639..acc4b3f 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -118,7 +118,7 @@ public:
MachineInstr *getParentInst() const { return Target->getParent(); }
MachineRegisterInfo *getMRI() const {
- return &getParentInst()->getParent()->getParent()->getRegInfo();
+ return &getParentInst()->getMF()->getRegInfo();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1284,7 +1284,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
// Clone the instruction to allow revoking changes
// made to MI during the processing of the operands
// if the conversion fails.
- SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
+ SDWAInst = MI.getMF()->CloneMachineInstr(&MI);
MI.getParent()->insert(MI.getIterator(), SDWAInst);
} else {
SDWAInst = createSDWAVersion(MI);
@@ -1334,20 +1334,21 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
const MCInstrDesc &Desc = TII->get(MI.getOpcode());
unsigned ConstantBusCount = 0;
for (MachineOperand &Op : MI.explicit_uses()) {
- if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
- continue;
-
- unsigned I = Op.getOperandNo();
+ if (Op.isReg()) {
+ if (TRI->isVGPR(*MRI, Op.getReg()))
+ continue;
- int16_t RegClass = TII->getOpRegClassID(Desc.operands()[I]);
- if (RegClass == -1 || !TRI->isVSSuperClass(TRI->getRegClass(RegClass)))
+ if (ST.hasSDWAScalar() && ConstantBusCount == 0) {
+ ++ConstantBusCount;
+ continue;
+ }
+ } else if (!Op.isImm())
continue;
- if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
- TRI->isSGPRReg(*MRI, Op.getReg())) {
- ++ConstantBusCount;
+ unsigned I = Op.getOperandNo();
+ const TargetRegisterClass *OpRC = TII->getRegClass(Desc, I);
+ if (!OpRC || !TRI->isVSSuperClass(OpRC))
continue;
- }
Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index 5720b97..b537e44 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -110,7 +110,7 @@ void SIPostRABundler::collectUsedRegUnits(const MachineInstr &MI,
"subregister indexes should not be present after RA");
for (MCRegUnit Unit : TRI->regunits(Reg))
- UsedRegUnits.set(Unit);
+ UsedRegUnits.set(static_cast<unsigned>(Unit));
}
}
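
The cast added here, and the similar ones in SIRegisterInfo.cpp further down, suggest that register units are now a distinct MCRegUnit type rather than a plain unsigned, while BitVector-style containers are still indexed by unsigned, so the conversion has to be spelled out. A toy illustration with a hypothetical RegUnit wrapper (not LLVM's MCRegUnit):

#include <bitset>
#include <cstdint>

// Hypothetical strongly typed unit id standing in for a real register unit.
enum class RegUnit : uint16_t {};

int main() {
  std::bitset<64> UsedRegUnits;
  auto Unit = static_cast<RegUnit>(5);
  // A scoped enum does not convert implicitly, so indexing the bit container
  // requires an explicit cast back to an integer type.
  UsedRegUnits.set(static_cast<unsigned>(Unit));
  return UsedRegUnits.test(5) ? 0 : 1;
}
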
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 7431e11..8785968 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -296,7 +296,7 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
E = MI.getIterator();
I != E; ++I) {
- if (I->isBundle())
+ if (I->isBundle() || I->isDebugInstr())
continue;
switch (I->getOpcode()) {
case AMDGPU::S_SET_GPR_IDX_MODE:
@@ -640,7 +640,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
}
void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
- MachineOperand DstOp = I.getOperand(0);
+ const MachineOperand &DstOp = I.getOperand(0);
uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index a6c1af2..66586e8 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -340,10 +340,12 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
"getNumCoveredRegs() will not work with generated subreg masks!");
RegPressureIgnoredUnits.resize(getNumRegUnits());
- RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
+ RegPressureIgnoredUnits.set(
+ static_cast<unsigned>(*regunits(MCRegister::from(AMDGPU::M0)).begin()));
for (auto Reg : AMDGPU::VGPR_16RegClass) {
if (AMDGPU::isHi16Reg(Reg, *this))
- RegPressureIgnoredUnits.set(*regunits(Reg).begin());
+ RegPressureIgnoredUnits.set(
+ static_cast<unsigned>(*regunits(Reg).begin()));
}
// HACK: Until this is fully tablegen'd.
@@ -1949,7 +1951,7 @@ void SIRegisterInfo::buildSpillLoadStore(
void SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB,
Register BlockReg) const {
- const MachineFunction *MF = MIB->getParent()->getParent();
+ const MachineFunction *MF = MIB->getMF();
const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
@@ -2319,7 +2321,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
- MachineFunction *MF = MI->getParent()->getParent();
+ MachineFunction *MF = MI->getMF();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
@@ -2981,10 +2983,36 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
: RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
MI, false, 0, !UseSGPR);
- // TODO: for flat scratch another attempt can be made with a VGPR index
- // if no SGPRs can be scavenged.
- if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
+ if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) {
+ int SVOpcode = AMDGPU::getFlatScratchInstSVfromSS(MI->getOpcode());
+ if (ST.hasFlatScratchSVSMode() && SVOpcode != -1) {
+ Register TmpVGPR = RS->scavengeRegisterBackwards(
+ AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
+
+ // Materialize the frame register.
+ auto MIB =
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR);
+ if (FrameReg)
+ MIB.addReg(FrameReg);
+ else
+ MIB.addImm(Offset);
+
+ // Add the offset to the materialized frame address in TmpVGPR.
+ if (FrameReg && Offset)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
+ .addImm(Offset)
+ .addReg(TmpVGPR);
+
+ BuildMI(*MBB, MI, DL, TII->get(SVOpcode))
+ .add(MI->getOperand(0)) // $vdata
+ .addReg(TmpVGPR) // $vaddr
+ .addImm(0) // Offset
+ .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::cpol));
+ MI->eraseFromParent();
+ return true;
+ }
report_fatal_error("Cannot scavenge register in FI elimination!");
+ }
if (!TmpSReg) {
// Use frame register and restore it after.
@@ -3046,7 +3074,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (!IsMUBUF && !MFI->isBottomOfStack()) {
// Convert to a swizzled stack address by scaling by the wave size.
// In an entry function/kernel the offset is already swizzled.
- bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
+ bool IsSALU = isSGPRClass(TII->getRegClass(MI->getDesc(), FIOperandNum));
bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
!MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
const TargetRegisterClass *RC = IsSALU && !LiveSCC
@@ -3558,6 +3586,17 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
}
const TargetRegisterClass *
+SIRegisterInfo::getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const {
+ // TODO: In principle this should use AV classes for gfx908 too. This is
+ // limited to 90a+ to avoid regressing special case copy optimizations which
+ // need new handling. The core issue is that it's not possible to directly
+ // copy between AGPRs on gfx908, and the current optimizations around that
+ // expect to see copies to VGPR.
+ return ST.hasGFX90AInsts() ? getVectorSuperClassForBitWidth(BitWidth)
+ : getVGPRClassForBitWidth(BitWidth);
+}
+
+const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
if (BitWidth == 16 || BitWidth == 32)
return &AMDGPU::SReg_32RegClass;
@@ -3628,6 +3667,14 @@ SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
}
const TargetRegisterClass *
+SIRegisterInfo::getEquivalentAVClass(const TargetRegisterClass *SRC) const {
+ unsigned Size = getRegSizeInBits(*SRC);
+ const TargetRegisterClass *ARC = getVectorSuperClassForBitWidth(Size);
+ assert(ARC && "Invalid register class size");
+ return ARC;
+}
+
+const TargetRegisterClass *
SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
unsigned Size = getRegSizeInBits(*VRC);
if (Size == 32)
@@ -3734,27 +3781,6 @@ bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
return RC && isAGPRClass(RC);
}
-bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
- const TargetRegisterClass *SrcRC,
- unsigned SubReg,
- const TargetRegisterClass *DstRC,
- unsigned DstSubReg,
- const TargetRegisterClass *NewRC,
- LiveIntervals &LIS) const {
- unsigned SrcSize = getRegSizeInBits(*SrcRC);
- unsigned DstSize = getRegSizeInBits(*DstRC);
- unsigned NewSize = getRegSizeInBits(*NewRC);
-
- // Do not increase size of registers beyond dword, we would need to allocate
- // adjacent registers and constraint regalloc more than needed.
-
- // Always allow dword coalescing.
- if (SrcSize <= 32 || DstSize <= 32)
- return true;
-
- return NewSize <= DstSize || NewSize <= SrcSize;
-}
-
unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
@@ -3788,10 +3814,10 @@ unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
llvm_unreachable("Unexpected register pressure set!");
}
-const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
+const int *SIRegisterInfo::getRegUnitPressureSets(MCRegUnit RegUnit) const {
static const int Empty[] = { -1 };
- if (RegPressureIgnoredUnits[RegUnit])
+ if (RegPressureIgnoredUnits[static_cast<unsigned>(RegUnit)])
return Empty;
return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
@@ -3915,20 +3941,6 @@ const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
: &AMDGPU::VReg_64RegClass;
}
-const TargetRegisterClass *
-SIRegisterInfo::getRegClass(unsigned RCID) const {
- switch ((int)RCID) {
- case AMDGPU::SReg_1RegClassID:
- return getBoolRC();
- case AMDGPU::SReg_1_XEXECRegClassID:
- return getWaveMaskRegClass();
- case -1:
- return nullptr;
- default:
- return AMDGPUGenRegisterInfo::getRegClass(RCID);
- }
-}
-
// Find reaching register definition
MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
MachineInstr &Use,
@@ -4017,28 +4029,6 @@ bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
return true;
}
-const TargetRegisterClass *
-SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
- if (!RC || !ST.needsAlignedVGPRs())
- return RC;
-
- unsigned Size = getRegSizeInBits(*RC);
- if (Size <= 32)
- return RC;
-
- if (RC == &AMDGPU::VS_64RegClass)
- return &AMDGPU::VS_64_Align2RegClass;
-
- if (isVGPRClass(RC))
- return getAlignedVGPRClassForBitWidth(Size);
- if (isAGPRClass(RC))
- return getAlignedAGPRClassForBitWidth(Size);
- if (isVectorSuperClass(RC))
- return getAlignedVectorSuperClassForBitWidth(Size);
-
- return RC;
-}
-
ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
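Editorial sketch, not part of the patch: the new getDefaultVectorSuperClassForBitWidth and getEquivalentAVClass helpers above hand back an AV (AGPR+VGPR) superclass only where the subtarget can copy between AGPRs directly, and fall back to plain VGPR classes on gfx908 per the TODO. A minimal caller shape might look like the following; the helper name createAVVirtReg and its placement are illustrative assumptions, not code from this change.

    // Sketch only: create a virtual register in the subtarget's preferred
    // vector class for a value of BitWidth bits. On gfx90a+ this yields an
    // AV_* superclass; on older targets it degrades to a plain VGPR class.
    Register createAVVirtReg(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
                             unsigned BitWidth) {
      const TargetRegisterClass *RC =
          TRI.getDefaultVectorSuperClassForBitWidth(BitWidth);
      assert(RC && "unsupported bit width");
      return MRI.createVirtualRegister(RC);
    }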
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 7b91ba7..2e2916f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -216,6 +216,10 @@ public:
getVectorSuperClassForBitWidth(unsigned BitWidth) const;
LLVM_READONLY
+ const TargetRegisterClass *
+ getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const;
+
+ LLVM_READONLY
static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth);
/// \returns true if this class contains only SGPR registers
@@ -285,6 +289,10 @@ public:
const TargetRegisterClass *
getEquivalentAGPRClass(const TargetRegisterClass *SRC) const;
+ /// \returns An AGPR+VGPR super reg class with the same width as \p SRC
+ const TargetRegisterClass *
+ getEquivalentAVClass(const TargetRegisterClass *SRC) const;
+
/// \returns A SGPR reg class with the same width as \p SRC
const TargetRegisterClass *
getEquivalentSGPRClass(const TargetRegisterClass *VRC) const;
@@ -338,14 +346,6 @@ public:
ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
unsigned EltSize) const;
- bool shouldCoalesce(MachineInstr *MI,
- const TargetRegisterClass *SrcRC,
- unsigned SubReg,
- const TargetRegisterClass *DstRC,
- unsigned DstSubReg,
- const TargetRegisterClass *NewRC,
- LiveIntervals &LIS) const override;
-
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
@@ -357,7 +357,7 @@ public:
const MachineFunction &MF, const VirtRegMap *VRM,
const LiveRegMatrix *Matrix) const override;
- const int *getRegUnitPressureSets(unsigned RegUnit) const override;
+ const int *getRegUnitPressureSets(MCRegUnit RegUnit) const override;
MCRegister getReturnAddressReg(const MachineFunction &MF) const;
@@ -391,8 +391,6 @@ public:
MCRegister getExec() const;
- const TargetRegisterClass *getRegClass(unsigned RCID) const;
-
// Find reaching register definition
MachineInstr *findReachingDef(Register Reg, unsigned SubReg,
MachineInstr &Use,
@@ -433,11 +431,6 @@ public:
// the subtarget.
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const;
- // Given \p RC returns corresponding aligned register class if required
- // by the subtarget.
- const TargetRegisterClass *
- getProperlyAlignedRC(const TargetRegisterClass *RC) const;
-
/// Return all SGPR128 which satisfy the waves per execution unit requirement
/// of the subtarget.
ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const;
@@ -495,6 +488,17 @@ public:
SmallVector<StringLiteral>
getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override;
+
+ float
+ getSpillWeightScaleFactor(const TargetRegisterClass *RC) const override {
+ // Prioritize VGPR_32_Lo256 over other classes which may occupy registers
+ // beyond v256.
+ return AMDGPUGenRegisterInfo::getSpillWeightScaleFactor(RC) *
+ ((RC == &AMDGPU::VGPR_32_Lo256RegClass ||
+ RC == &AMDGPU::VReg_64_Lo256_Align2RegClass)
+ ? 2.0
+ : 1.0);
+ }
};
namespace AMDGPU {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index fc8f46a..272d4b5 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -644,7 +644,7 @@ def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg1
// Identical to VGPR_32 except it only contains the low 256 (Lo256) registers.
def VGPR_32_Lo256 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 255))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(3, !mul(BaseClassPriority, BaseClassScaleFactor));
let GeneratePressureSet = 0;
let Size = 32;
let Weight = 1;
@@ -896,20 +896,6 @@ def SReg_64_Encodable : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v
let Size = 64;
}
-def SReg_1_XEXEC : SIRegisterClass<"AMDGPU", [i1], 32,
- (add SReg_64_XEXEC, SReg_32_XEXEC)> {
- let CopyCost = 1;
- let isAllocatable = 0;
- let HasSGPR = 1;
-}
-
-def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32,
- (add SReg_1_XEXEC, EXEC, EXEC_LO, EXEC_HI)> {
- let CopyCost = 1;
- let isAllocatable = 0;
- let HasSGPR = 1;
-}
-
multiclass SRegClass<int numRegs,
list<ValueType> regTypes,
SIRegisterTuples regList,
@@ -1205,6 +1191,34 @@ defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>;
defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_1024)>;
}
+def SReg_1_XEXEC : SIRegisterClassLike<0, false, false, true>,
+ RegClassByHwMode<
+ [DefaultMode_Wave64,
+ AlignedVGPRNoAGPRMode_Wave64,
+ AVAlign2LoadStoreMode,
+ DefaultMode_Wave32,
+ AlignedVGPRNoAGPRMode_Wave32],
+ [SReg_64_XEXEC,
+ SReg_64_XEXEC,
+ SReg_64_XEXEC,
+ SReg_32_XM0_XEXEC, // FIXME: Why do the wave32 cases exclude m0?
+ SReg_32_XM0_XEXEC]
+>;
+
+def SReg_1 : SIRegisterClassLike<0, false, false, true>,
+ RegClassByHwMode<
+ [DefaultMode_Wave64,
+ AlignedVGPRNoAGPRMode_Wave64,
+ AVAlign2LoadStoreMode,
+ DefaultMode_Wave32,
+ AlignedVGPRNoAGPRMode_Wave32],
+ [SReg_64,
+ SReg_64,
+ SReg_64,
+ SReg_32,
+ SReg_32]
+>;
+
//===----------------------------------------------------------------------===//
//
// AlignTarget classes. Artificial classes to swap between
@@ -1212,17 +1226,36 @@ defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_102
//
//===----------------------------------------------------------------------===//
+// We have 3 orthogonal properties to consider. Unfortunately we need
+// to define the cross product of these states, minus unused
+// combinations.
+
def AV_LdSt_32_Target : RegClassByHwMode<
- [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
- [VGPR_32, AV_32, VGPR_32]>, SIRegisterClassLike<32, true, true> {
+ [DefaultMode_Wave64,
+ DefaultMode_Wave32,
+ AVAlign2LoadStoreMode,
+ AlignedVGPRNoAGPRMode_Wave64,
+ AlignedVGPRNoAGPRMode_Wave32],
+ [VGPR_32,
+ VGPR_32,
+ AV_32,
+ VGPR_32,
+ VGPR_32]>,
+ SIRegisterClassLike<32, true, true> {
let DecoderMethod = "decodeAVLdSt";
}
foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 1024 ] in {
def VReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true>,
RegClassByHwMode<
- [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+ [DefaultMode_Wave64,
+ DefaultMode_Wave32,
+ AVAlign2LoadStoreMode,
+ AlignedVGPRNoAGPRMode_Wave64,
+ AlignedVGPRNoAGPRMode_Wave32],
[!cast<RegisterClass>("VReg_"#RegSize),
+ !cast<RegisterClass>("VReg_"#RegSize),
+ !cast<RegisterClass>("VReg_"#RegSize#_Align2),
!cast<RegisterClass>("VReg_"#RegSize#_Align2),
!cast<RegisterClass>("VReg_"#RegSize#_Align2)]> {
let DecoderMethod = "DecodeVReg_"#RegSize#"RegisterClass";
@@ -1230,45 +1263,59 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 10
def AReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, false, true>,
RegClassByHwMode<
- [DefaultMode, AVAlign2LoadStoreMode, /*Unused combination*/],
+ [DefaultMode_Wave64, /*unused combination*/ AVAlign2LoadStoreMode, /*Unused combination*/ /*Unused combination*/],
[!cast<RegisterClass>("AReg_"#RegSize),
+ /*unused combination*/
!cast<RegisterClass>("AReg_"#RegSize#_Align2)
+ /*Unused combination*/
/*Unused combination*/]> {
let DecoderMethod = "DecodeAReg_"#RegSize#"RegisterClass";
}
def AV_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>,
RegClassByHwMode<
- [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+ [DefaultMode_Wave32,
+ DefaultMode_Wave64,
+ AVAlign2LoadStoreMode,
+ AlignedVGPRNoAGPRMode_Wave64,
+ AlignedVGPRNoAGPRMode_Wave32],
[!cast<RegisterClass>("AV_"#RegSize),
+ !cast<RegisterClass>("AV_"#RegSize),
!cast<RegisterClass>("AV_"#RegSize#_Align2),
+ !cast<RegisterClass>("VReg_"#RegSize#_Align2),
!cast<RegisterClass>("VReg_"#RegSize#_Align2)]> {
let DecoderMethod = "DecodeAV_"#RegSize#"RegisterClass";
}
def AV_LdSt_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>,
RegClassByHwMode<
- [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+ [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32],
[!cast<RegisterClass>("VReg_"#RegSize),
+ !cast<RegisterClass>("VReg_"#RegSize),
!cast<RegisterClass>("AV_"#RegSize#_Align2),
+ !cast<RegisterClass>("VReg_"#RegSize#_Align2),
!cast<RegisterClass>("VReg_"#RegSize#_Align2)]> {
let DecoderMethod = "decodeAVLdSt";
}
def AV_LdSt_#RegSize#_Align2 : SIRegisterClassLike<RegSize, true, true>,
RegClassByHwMode<
- [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+ [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32],
[!cast<RegisterClass>("VReg_"#RegSize#_Align2),
+ !cast<RegisterClass>("VReg_"#RegSize#_Align2),
!cast<RegisterClass>("AV_"#RegSize#_Align2),
+ !cast<RegisterClass>("VReg_"#RegSize#_Align2),
!cast<RegisterClass>("VReg_"#RegSize#_Align2)]> {
let DecoderMethod = "decodeAVLdSt";
}
def AV_LdSt_#RegSize#_Align1 : SIRegisterClassLike<RegSize, true, true>,
RegClassByHwMode<
- [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+ [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32],
[!cast<RegisterClass>("VReg_"#RegSize),
+ !cast<RegisterClass>("VReg_"#RegSize),
!cast<RegisterClass>("AV_"#RegSize),
+ !cast<RegisterClass>("VReg_"#RegSize),
!cast<RegisterClass>("VReg_"#RegSize)]> {
let DecoderMethod = "decodeAVLdSt";
}
@@ -1276,11 +1323,22 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 10
def VS_64_AlignTarget : SIRegisterClassLike<64, true, false, true>,
RegClassByHwMode<
- [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
- [VS_64, VS_64_Align2, VS_64_Align2]> {
+ [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32],
+ [VS_64, VS_64, VS_64_Align2, VS_64_Align2, VS_64_Align2]> {
let DecoderMethod = "decodeSrcRegOrImm9";
}
+
+// Special case for DS_GWS instructions. The register input is really
+// 32-bit, but it needs to be even aligned on targets with a VGPR
+// alignment requirement.
+def AV_LdSt_32_Align2 : SIRegisterClassLike</*Bitwidth=*/32, /*VGPR=*/true, /*AGPR=*/true>,
+ RegClassByHwMode<
+ [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32],
+ [VGPR_32, VGPR_32, AV_64_Align2, VReg_64_Align2, VReg_64_Align2]> {
+ let DecoderMethod = "decodeAVLdSt<32>";
+}
+
class RegImmMatcher<string name> : AsmOperandClass {
let Name = name;
let RenderMethod = "addRegOrImmOperands";
@@ -1533,6 +1591,17 @@ foreach size = ["64", "96", "128", "160", "256", "1024" ] in {
def AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align2)>;
}
+def AV_LdSt_32_Align2_RegMatcher : AsmOperandClass {
+ let Name = "AV_LdSt_32_Align2_RegOp";
+ let RenderMethod = "addRegOperands";
+}
+
+def AV_LdSt_32_Align2_RegOp : RegisterOperand<AV_LdSt_32_Align2> {
+ let ParserMatchClass = AV_LdSt_32_Align2_RegMatcher;
+ let PrintMethod = "printAVLdSt32Align2RegOp";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
//===----------------------------------------------------------------------===//
// ACSrc_* Operands with an AGPR or an inline constant
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 179ecba..1b78f67 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -550,7 +550,8 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
uint32_t NewImm = 0;
if (Opc == AMDGPU::S_AND_B32) {
- if (isPowerOf2_32(~Imm)) {
+ if (isPowerOf2_32(~Imm) &&
+ MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)->isDead()) {
NewImm = llvm::countr_one(Imm);
Opc = AMDGPU::S_BITSET0_B32;
} else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
@@ -558,7 +559,8 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
Opc = AMDGPU::S_ANDN2_B32;
}
} else if (Opc == AMDGPU::S_OR_B32) {
- if (isPowerOf2_32(Imm)) {
+ if (isPowerOf2_32(Imm) &&
+ MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)->isDead()) {
NewImm = llvm::countr_zero(Imm);
Opc = AMDGPU::S_BITSET1_B32;
} else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
@@ -584,7 +586,7 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
const bool IsUndef = SrcReg->isUndef();
const bool IsKill = SrcReg->isKill();
- MI.setDesc(TII->get(Opc));
+ TII->mutateAndCleanupImplicit(MI, TII->get(Opc));
if (Opc == AMDGPU::S_BITSET0_B32 ||
Opc == AMDGPU::S_BITSET1_B32) {
Src0->ChangeToImmediate(NewImm);
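Editorial sketch, not part of the patch: the S_AND_B32/S_OR_B32 to S_BITSET0_B32/S_BITSET1_B32 shrink above is only sound when SCC is unused, because the BITSET forms keep only the register result; the added findRegisterDefOperand(...)->isDead() checks encode exactly that. A standalone form of the legality test, assuming (as the hunks do) that an SCC def operand is always present on these scalar ALU instructions:

    // Sketch of the added legality check: the rewrite to S_BITSET0/1_B32
    // requires (a) a single-bit (AND: single-zero) immediate and (b) a dead
    // SCC definition on the original instruction.
    static bool canShrinkToBitset(const MachineInstr &MI, uint32_t Imm,
                                  bool IsAnd) {
      if (!(IsAnd ? isPowerOf2_32(~Imm) : isPowerOf2_32(Imm)))
        return false;
      const MachineOperand *SCCDef =
          MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
      return SCCDef && SCCDef->isDead();
    }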
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 6611e1e..10762ed 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -188,8 +188,9 @@ private:
void markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
- void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
- unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
+ void markDefs(const MachineInstr &UseMI, LiveRange &LR,
+ VirtRegOrUnit VRegOrUnit, unsigned SubReg, char Flag,
+ std::vector<WorkItem> &Worklist);
void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
std::vector<WorkItem> &Worklist);
void markInstructionUses(const MachineInstr &MI, char Flag,
@@ -318,8 +319,8 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
- Register Reg, unsigned SubReg, char Flag,
- std::vector<WorkItem> &Worklist) {
+ VirtRegOrUnit VRegOrUnit, unsigned SubReg,
+ char Flag, std::vector<WorkItem> &Worklist) {
LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
@@ -331,8 +332,9 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
// cover registers.
const LaneBitmask UseLanes =
SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
- : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
- : LaneBitmask::getNone());
+ : (VRegOrUnit.isVirtualReg()
+ ? MRI->getMaxLaneMaskForVReg(VRegOrUnit.asVirtualReg())
+ : LaneBitmask::getNone());
// Perform a depth-first iteration of the LiveRange graph marking defs.
// Stop processing of a given branch when all use lanes have been defined.
@@ -382,11 +384,11 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
assert(MI && "Def has no defining instruction");
- if (Reg.isVirtual()) {
+ if (VRegOrUnit.isVirtualReg()) {
// Iterate over all operands to find relevant definitions
bool HasDef = false;
for (const MachineOperand &Op : MI->all_defs()) {
- if (Op.getReg() != Reg)
+ if (Op.getReg() != VRegOrUnit.asVirtualReg())
continue;
// Compute lanes defined and overlap with use
@@ -453,7 +455,7 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI,
<< " for " << MI);
if (Reg.isVirtual()) {
LiveRange &LR = LIS->getInterval(Reg);
- markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
+ markDefs(MI, LR, VirtRegOrUnit(Reg), Op.getSubReg(), Flag, Worklist);
} else {
// Handle physical registers that we need to track; this is mostly relevant
// for VCC, which can appear as the (implicit) input of a uniform branch,
@@ -462,7 +464,8 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI,
LiveRange &LR = LIS->getRegUnit(Unit);
const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
if (Value)
- markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
+ markDefs(MI, LR, VirtRegOrUnit(Unit), AMDGPU::NoSubRegister, Flag,
+ Worklist);
}
}
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 6489e63..ce782b0 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -211,6 +211,7 @@ static constexpr CustomOperand Operands[] = {
{{"HW_REG_HW_ID2"}, ID_HW_ID2, isGFX10Plus},
{{"HW_REG_SQ_PERF_SNAPSHOT_PC_HI"}, ID_SQ_PERF_SNAPSHOT_PC_HI, isGFX940},
{{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10},
+ {{"HW_REG_WAVE_SCHED_MODE"}, ID_SCHED_MODE, isGFX12Plus},
{{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11},
{{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250},
{{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11},
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 3e1b058..c6e061f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -729,6 +729,8 @@ bool isGenericAtomic(unsigned Opc) {
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32 ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 ||
Opc == AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG;
}
@@ -897,7 +899,7 @@ unsigned ComponentInfo::getIndexInParsedOperands(unsigned CompOprIdx) const {
}
std::optional<unsigned> InstInfo::getInvalidCompOperandIndex(
- std::function<unsigned(unsigned, unsigned)> GetRegIdx,
+ std::function<MCRegister(unsigned, unsigned)> GetRegIdx,
const MCRegisterInfo &MRI, bool SkipSrc, bool AllowSameVGPR,
bool VOPD3) const {
@@ -914,12 +916,13 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex(
BaseX = X;
if (!BaseY)
BaseY = Y;
- if ((BaseX & BanksMask) == (BaseY & BanksMask))
+ if ((BaseX.id() & BanksMask) == (BaseY.id() & BanksMask))
return true;
if (BaseX != X /* This is 64-bit register */ &&
- ((BaseX + 1) & BanksMask) == (BaseY & BanksMask))
+ ((BaseX.id() + 1) & BanksMask) == (BaseY.id() & BanksMask))
return true;
- if (BaseY != Y && (BaseX & BanksMask) == ((BaseY + 1) & BanksMask))
+ if (BaseY != Y &&
+ (BaseX.id() & BanksMask) == ((BaseY.id() + 1) & BanksMask))
return true;
// If both are 64-bit bank conflict will be detected yet while checking
@@ -968,7 +971,7 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex(
// if the operand is not a register or not a VGPR.
InstInfo::RegIndices
InstInfo::getRegIndices(unsigned CompIdx,
- std::function<unsigned(unsigned, unsigned)> GetRegIdx,
+ std::function<MCRegister(unsigned, unsigned)> GetRegIdx,
bool VOPD3) const {
assert(CompIdx < COMPONENTS_NUM);
@@ -983,7 +986,7 @@ InstInfo::getRegIndices(unsigned CompIdx,
Comp.hasRegSrcOperand(CompSrcIdx)
? GetRegIdx(CompIdx,
Comp.getIndexOfSrcInMCOperands(CompSrcIdx, VOPD3))
- : 0;
+ : MCRegister();
}
return RegIndices;
}
@@ -2051,56 +2054,63 @@ unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc) {
return packBits(VmVsrc, Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth());
}
-unsigned encodeFieldVmVsrc(unsigned VmVsrc) {
- return encodeFieldVmVsrc(0xffff, VmVsrc);
+unsigned encodeFieldVmVsrc(unsigned VmVsrc, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldVmVsrc(Encoded, VmVsrc);
}
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst) {
return packBits(VaVdst, Encoded, getVaVdstBitShift(), getVaVdstBitWidth());
}
-unsigned encodeFieldVaVdst(unsigned VaVdst) {
- return encodeFieldVaVdst(0xffff, VaVdst);
+unsigned encodeFieldVaVdst(unsigned VaVdst, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldVaVdst(Encoded, VaVdst);
}
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst) {
return packBits(SaSdst, Encoded, getSaSdstBitShift(), getSaSdstBitWidth());
}
-unsigned encodeFieldSaSdst(unsigned SaSdst) {
- return encodeFieldSaSdst(0xffff, SaSdst);
+unsigned encodeFieldSaSdst(unsigned SaSdst, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldSaSdst(Encoded, SaSdst);
}
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst) {
return packBits(VaSdst, Encoded, getVaSdstBitShift(), getVaSdstBitWidth());
}
-unsigned encodeFieldVaSdst(unsigned VaSdst) {
- return encodeFieldVaSdst(0xffff, VaSdst);
+unsigned encodeFieldVaSdst(unsigned VaSdst, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldVaSdst(Encoded, VaSdst);
}
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc) {
return packBits(VaVcc, Encoded, getVaVccBitShift(), getVaVccBitWidth());
}
-unsigned encodeFieldVaVcc(unsigned VaVcc) {
- return encodeFieldVaVcc(0xffff, VaVcc);
+unsigned encodeFieldVaVcc(unsigned VaVcc, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldVaVcc(Encoded, VaVcc);
}
unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc) {
return packBits(VaSsrc, Encoded, getVaSsrcBitShift(), getVaSsrcBitWidth());
}
-unsigned encodeFieldVaSsrc(unsigned VaSsrc) {
- return encodeFieldVaSsrc(0xffff, VaSsrc);
+unsigned encodeFieldVaSsrc(unsigned VaSsrc, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldVaSsrc(Encoded, VaSsrc);
}
unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt) {
return packBits(HoldCnt, Encoded, getHoldCntBitShift(), getHoldCntWidth());
}
-unsigned encodeFieldHoldCnt(unsigned HoldCnt) {
- return encodeFieldHoldCnt(0xffff, HoldCnt);
+unsigned encodeFieldHoldCnt(unsigned HoldCnt, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldHoldCnt(Encoded, HoldCnt);
}
} // namespace DepCtr
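Editorial sketch, not part of the patch: with the change above, the one-argument encodeField* helpers seed the depctr value from getDefaultDepCtrEncoding(STI) instead of the literal 0xffff, so fields whose reset value is not all-ones keep their subtarget default. A minimal usage shape, where the wrapper name and chosen field values are illustrative assumptions:

    // Sketch: start from the subtarget default via the single-argument form,
    // then override further fields with the accumulating two-argument forms.
    unsigned buildWaitVmVsrcAndVaVdst(const MCSubtargetInfo &STI) {
      unsigned Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(/*VmVsrc=*/0, STI);
      return AMDGPU::DepCtr::encodeFieldVaVdst(Enc, /*VaVdst=*/0);
    }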
@@ -2697,8 +2707,8 @@ MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI) {
MCRegister mc2PseudoReg(MCRegister Reg) { MAP_REG2REG }
-bool isInlineValue(unsigned Reg) {
- switch (Reg) {
+bool isInlineValue(MCRegister Reg) {
+ switch (Reg.id()) {
case AMDGPU::SRC_SHARED_BASE_LO:
case AMDGPU::SRC_SHARED_BASE:
case AMDGPU::SRC_SHARED_LIMIT_LO:
@@ -3361,7 +3371,7 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
: getGfx9BufferFormatInfo(Format);
}
-const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg,
+const MCRegisterClass *getVGPRPhysRegClass(MCRegister Reg,
const MCRegisterInfo &MRI) {
const unsigned VGPRClasses[] = {
AMDGPU::VGPR_16RegClassID, AMDGPU::VGPR_32RegClassID,
@@ -3382,22 +3392,22 @@ const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg,
return nullptr;
}
-unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI) {
+unsigned getVGPREncodingMSBs(MCRegister Reg, const MCRegisterInfo &MRI) {
unsigned Enc = MRI.getEncodingValue(Reg);
unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
return Idx >> 8;
}
-MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs,
- const MCRegisterInfo &MRI) {
+MCRegister getVGPRWithMSBs(MCRegister Reg, unsigned MSBs,
+ const MCRegisterInfo &MRI) {
unsigned Enc = MRI.getEncodingValue(Reg);
unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
if (Idx >= 0x100)
- return AMDGPU::NoRegister;
+ return MCRegister();
const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI);
if (!RC)
- return AMDGPU::NoRegister;
+ return MCRegister();
Idx |= MSBs << 8;
if (RC->getID() == AMDGPU::VGPR_16RegClassID) {
@@ -3438,17 +3448,42 @@ getVGPRLoweringOperandTables(const MCInstrDesc &Desc) {
AMDGPU::OpName::src0Y, AMDGPU::OpName::vsrc1Y, AMDGPU::OpName::vsrc2Y,
AMDGPU::OpName::vdstY};
+ // VOP2 MADMK instructions use src0, imm, src1 scheme.
+ static const AMDGPU::OpName VOP2MADMKOps[4] = {
+ AMDGPU::OpName::src0, AMDGPU::OpName::NUM_OPERAND_NAMES,
+ AMDGPU::OpName::src1, AMDGPU::OpName::vdst};
+ static const AMDGPU::OpName VOPDFMAMKOpsX[4] = {
+ AMDGPU::OpName::src0X, AMDGPU::OpName::NUM_OPERAND_NAMES,
+ AMDGPU::OpName::vsrc1X, AMDGPU::OpName::vdstX};
+ static const AMDGPU::OpName VOPDFMAMKOpsY[4] = {
+ AMDGPU::OpName::src0Y, AMDGPU::OpName::NUM_OPERAND_NAMES,
+ AMDGPU::OpName::vsrc1Y, AMDGPU::OpName::vdstY};
+
unsigned TSFlags = Desc.TSFlags;
if (TSFlags &
(SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | SIInstrFlags::VOP3 |
SIInstrFlags::VOP3P | SIInstrFlags::VOPC | SIInstrFlags::DPP)) {
+ switch (Desc.getOpcode()) {
// LD_SCALE operands ignore MSB.
- if (Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32 ||
- Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32_gfx1250 ||
- Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64 ||
- Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64_gfx1250)
+ case AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32:
+ case AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32_gfx1250:
+ case AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64:
+ case AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64_gfx1250:
return {};
+ case AMDGPU::V_FMAMK_F16:
+ case AMDGPU::V_FMAMK_F16_t16:
+ case AMDGPU::V_FMAMK_F16_t16_gfx12:
+ case AMDGPU::V_FMAMK_F16_fake16:
+ case AMDGPU::V_FMAMK_F16_fake16_gfx12:
+ case AMDGPU::V_FMAMK_F32:
+ case AMDGPU::V_FMAMK_F32_gfx12:
+ case AMDGPU::V_FMAMK_F64:
+ case AMDGPU::V_FMAMK_F64_gfx1250:
+ return {VOP2MADMKOps, nullptr};
+ default:
+ break;
+ }
return {VOPOps, nullptr};
}
@@ -3464,8 +3499,11 @@ getVGPRLoweringOperandTables(const MCInstrDesc &Desc) {
if (TSFlags & SIInstrFlags::VIMAGE)
return {VIMGOps, nullptr};
- if (AMDGPU::isVOPD(Desc.getOpcode()))
- return {VOPDOpsX, VOPDOpsY};
+ if (AMDGPU::isVOPD(Desc.getOpcode())) {
+ auto [OpX, OpY] = getVOPDComponents(Desc.getOpcode());
+ return {(OpX == AMDGPU::V_FMAMK_F32) ? VOPDFMAMKOpsX : VOPDOpsX,
+ (OpY == AMDGPU::V_FMAMK_F32) ? VOPDFMAMKOpsY : VOPDOpsY};
+ }
assert(!(TSFlags & SIInstrFlags::MIMG));
@@ -3545,8 +3583,15 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII,
}
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
- return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256
- : 128;
+ if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize32768))
+ return 64;
+ if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize65536))
+ return 128;
+ if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
+ return 320;
+ if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize327680))
+ return 512;
+ return 64; // In sync with getAddressableLocalMemorySize
}
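Editorial sketch, not part of the patch: getLdsDwGranularity now scales the LDS allocation granule (in dwords) with the addressable local memory size feature rather than assuming two fixed values. One plausible consumer pattern, shown purely as an assumption about how the value is typically used:

    // Sketch: round an LDS block size in bytes up to the allocation granule.
    // The granularity returned above is in dwords, hence the *4.
    unsigned roundUpLdsBytes(unsigned Bytes, const MCSubtargetInfo &ST) {
      unsigned GranuleBytes = AMDGPU::getLdsDwGranularity(ST) * 4;
      return static_cast<unsigned>(alignTo(Bytes, GranuleBytes));
    }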
bool isPackedFP32Inst(unsigned Opc) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 5e3195b..3a35200 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -909,7 +909,7 @@ private:
const ComponentInfo CompInfo[COMPONENTS_NUM];
public:
- using RegIndices = std::array<unsigned, Component::MAX_OPR_NUM>;
+ using RegIndices = std::array<MCRegister, Component::MAX_OPR_NUM>;
InstInfo(const MCInstrDesc &OpX, const MCInstrDesc &OpY)
: CompInfo{OpX, OpY} {}
@@ -932,9 +932,10 @@ public:
// even though it violates requirement to be from different banks.
// If \p VOPD3 is set to true both dst registers allowed to be either odd
// or even and instruction may have real src2 as opposed to tied accumulator.
- bool hasInvalidOperand(std::function<unsigned(unsigned, unsigned)> GetRegIdx,
- const MCRegisterInfo &MRI, bool SkipSrc = false,
- bool AllowSameVGPR = false, bool VOPD3 = false) const {
+ bool
+ hasInvalidOperand(std::function<MCRegister(unsigned, unsigned)> GetRegIdx,
+ const MCRegisterInfo &MRI, bool SkipSrc = false,
+ bool AllowSameVGPR = false, bool VOPD3 = false) const {
return getInvalidCompOperandIndex(GetRegIdx, MRI, SkipSrc, AllowSameVGPR,
VOPD3)
.has_value();
@@ -949,14 +950,14 @@ public:
// If \p VOPD3 is set to true both dst registers allowed to be either odd
// or even and instruction may have real src2 as opposed to tied accumulator.
std::optional<unsigned> getInvalidCompOperandIndex(
- std::function<unsigned(unsigned, unsigned)> GetRegIdx,
+ std::function<MCRegister(unsigned, unsigned)> GetRegIdx,
const MCRegisterInfo &MRI, bool SkipSrc = false,
bool AllowSameVGPR = false, bool VOPD3 = false) const;
private:
RegIndices
getRegIndices(unsigned ComponentIdx,
- std::function<unsigned(unsigned, unsigned)> GetRegIdx,
+ std::function<MCRegister(unsigned, unsigned)> GetRegIdx,
bool VOPD3) const;
};
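Editorial sketch, not part of the patch: the VOPD InstInfo callbacks now traffic in MCRegister rather than raw unsigned encodings. A hedged example of a caller adapted to the new signature, assuming (as the in-tree users do) that the operand index passed to the callback refers to the combined MCInst; the wrapper name is illustrative:

    // Sketch: query the VOPD bank-conflict check with an MCRegister-returning
    // accessor; the component index is ignored here.
    bool vopdHasInvalidOperand(const AMDGPU::VOPD::InstInfo &Info,
                               const MCInst &Inst, const MCRegisterInfo &MRI) {
      auto GetRegIdx = [&Inst](unsigned /*CompIdx*/, unsigned OpIdx) {
        const MCOperand &Op = Inst.getOperand(OpIdx);
        return Op.isReg() ? Op.getReg() : MCRegister();
      };
      return Info.hasInvalidOperand(GetRegIdx, MRI);
    }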
@@ -1300,43 +1301,43 @@ unsigned decodeFieldVaSsrc(unsigned Encoded);
unsigned decodeFieldHoldCnt(unsigned Encoded);
/// \returns \p VmVsrc as an encoded Depctr immediate.
-unsigned encodeFieldVmVsrc(unsigned VmVsrc);
+unsigned encodeFieldVmVsrc(unsigned VmVsrc, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p VmVsrc.
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc);
/// \returns \p VaVdst as an encoded Depctr immediate.
-unsigned encodeFieldVaVdst(unsigned VaVdst);
+unsigned encodeFieldVaVdst(unsigned VaVdst, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p VaVdst.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst);
/// \returns \p SaSdst as an encoded Depctr immediate.
-unsigned encodeFieldSaSdst(unsigned SaSdst);
+unsigned encodeFieldSaSdst(unsigned SaSdst, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p SaSdst.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst);
/// \returns \p VaSdst as an encoded Depctr immediate.
-unsigned encodeFieldVaSdst(unsigned VaSdst);
+unsigned encodeFieldVaSdst(unsigned VaSdst, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p VaSdst.
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst);
/// \returns \p VaVcc as an encoded Depctr immediate.
-unsigned encodeFieldVaVcc(unsigned VaVcc);
+unsigned encodeFieldVaVcc(unsigned VaVcc, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p VaVcc.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc);
/// \returns \p HoldCnt as an encoded Depctr immediate.
-unsigned encodeFieldHoldCnt(unsigned HoldCnt);
+unsigned encodeFieldHoldCnt(unsigned HoldCnt, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p HoldCnt.
-unsigned encodeFieldHoldCnt(unsigned HoldCnt, unsigned Encoded);
+unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt);
/// \returns \p VaSsrc as an encoded Depctr immediate.
-unsigned encodeFieldVaSsrc(unsigned VaSsrc);
+unsigned encodeFieldVaSsrc(unsigned VaSsrc, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p VaSsrc.
unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc);
@@ -1513,6 +1514,8 @@ constexpr inline bool isKernel(CallingConv::ID CC) {
}
}
+inline bool isKernel(const Function &F) { return isKernel(F.getCallingConv()); }
+
LLVM_READNONE
constexpr bool canGuaranteeTCO(CallingConv::ID CC) {
return CC == CallingConv::Fast;
@@ -1599,7 +1602,7 @@ LLVM_READNONE
MCRegister mc2PseudoReg(MCRegister Reg);
LLVM_READNONE
-bool isInlineValue(unsigned Reg);
+bool isInlineValue(MCRegister Reg);
/// Is this an AMDGPU specific source operand? These include registers,
/// inline constants, literals and mandatory literals (KImm).
@@ -1798,16 +1801,16 @@ bool isIntrinsicAlwaysUniform(unsigned IntrID);
/// \returns a register class for the physical register \p Reg if it is a VGPR
/// or nullptr otherwise.
-const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg,
+const MCRegisterClass *getVGPRPhysRegClass(MCRegister Reg,
const MCRegisterInfo &MRI);
/// \returns the MODE bits which have to be set by the S_SET_VGPR_MSB for the
/// physical register \p Reg.
-unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI);
+unsigned getVGPREncodingMSBs(MCRegister Reg, const MCRegisterInfo &MRI);
/// If \p Reg is a low VGPR return a corresponding high VGPR with \p MSBs set.
-MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs,
- const MCRegisterInfo &MRI);
+MCRegister getVGPRWithMSBs(MCRegister Reg, unsigned MSBs,
+ const MCRegisterInfo &MRI);
// Returns a table for the opcode with a given \p Desc to map the VGPR MSB
// set by the S_SET_VGPR_MSB to one of 4 sources. In case of VOPD returns 2
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 54f57e0..1d1e959 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -513,6 +513,13 @@ defm V_CVT_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_u16_f16",
defm V_CVT_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_i16_f16",
VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16, fp_to_sint>;
+
+let HasClamp = 0, HasOMod = 0 in {
+def V_TRANS_BF16_Profile : VOPProfile <[bf16, bf16, untyped, untyped]>;
+def V_TRANS_BF16_t16_Profile : VOPProfile_True16 <VOP_BF16_BF16>;
+def V_TRANS_BF16_fake16_Profile : VOPProfile_Fake16 <VOP_BF16_BF16>;
+}
+
let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>;
@@ -527,14 +534,30 @@ defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>;
}
let SubtargetPredicate = HasBF16TransInsts in {
-defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>;
-defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>;
-defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>;
-defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>;
-defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>;
-defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>;
-defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>;
-defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>;
+defm V_TANH_BF16 : VOP1Inst_t16_with_profiles<"v_tanh_bf16", V_TRANS_BF16_Profile,
+ V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+ int_amdgcn_tanh>;
+defm V_RCP_BF16 : VOP1Inst_t16_with_profiles<"v_rcp_bf16", V_TRANS_BF16_Profile,
+ V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+ AMDGPUrcp>;
+defm V_SQRT_BF16 : VOP1Inst_t16_with_profiles<"v_sqrt_bf16", V_TRANS_BF16_Profile,
+ V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+ any_amdgcn_sqrt>;
+defm V_RSQ_BF16 : VOP1Inst_t16_with_profiles<"v_rsq_bf16", V_TRANS_BF16_Profile,
+ V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+ AMDGPUrsq>;
+defm V_LOG_BF16 : VOP1Inst_t16_with_profiles<"v_log_bf16", V_TRANS_BF16_Profile,
+ V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+ AMDGPUlogf16>;
+defm V_EXP_BF16 : VOP1Inst_t16_with_profiles<"v_exp_bf16", V_TRANS_BF16_Profile,
+ V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+ AMDGPUexpf16>;
+defm V_SIN_BF16 : VOP1Inst_t16_with_profiles<"v_sin_bf16", V_TRANS_BF16_Profile,
+ V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+ AMDGPUsin>;
+defm V_COS_BF16 : VOP1Inst_t16_with_profiles<"v_cos_bf16", V_TRANS_BF16_Profile,
+ V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+ AMDGPUcos>;
}
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
@@ -593,15 +616,15 @@ let SubtargetPredicate = isGFX9Plus in {
let isReMaterializable = 1 in
defm V_SAT_PK_U8_I16 : VOP1Inst_t16<"v_sat_pk_u8_i16", VOP_I16_I32>;
-
- let mayRaiseFPException = 0 in {
- defm V_CVT_NORM_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_i16_f16",
- VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>;
- defm V_CVT_NORM_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_u16_f16",
- VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>;
- } // End mayRaiseFPException = 0
} // End SubtargetPredicate = isGFX9Plus
+let mayRaiseFPException = 0, SubtargetPredicate = HasCvtNormInsts in {
+defm V_CVT_NORM_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_i16_f16",
+ VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>;
+defm V_CVT_NORM_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_u16_f16",
+ VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>;
+} // End mayRaiseFPException = 0, SubtargetPredicate = HasCvtNormInsts
+
let SubtargetPredicate = isGFX9Only in {
defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>;
} // End SubtargetPredicate = isGFX9Only
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index d87d250..11ce102 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -971,7 +971,7 @@ defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_a
} // End IsNeverUniform = 1
defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, any_fldexp>;
-let ReadsModeReg = 0, mayRaiseFPException = 0 in {
+let ReadsModeReg = 0, mayRaiseFPException = 0, SubtargetPredicate = HasCvtPkNormVOP2Insts in {
defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_i16_f32>;
defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_u16_f32>;
}
@@ -1266,14 +1266,14 @@ let Constraints = "$vdst = $src2",
defm V_FMAC_F32 : VOP2Inst_VOPD <"v_fmac_f32", VOP_MAC_F32, 0x0, "v_fmac_f32">;
} // End SubtargetPredicate = HasDLInsts
-let SubtargetPredicate = HasFmaLegacy32 in {
+let SubtargetPredicate = HasFmacLegacy32 in {
let Constraints = "$vdst = $src2",
isConvertibleToThreeAddress = 1,
isCommutable = 1 in
defm V_FMAC_LEGACY_F32 : VOP2Inst <"v_fmac_legacy_f32", VOP_MAC_LEGACY_F32>;
-} // End SubtargetPredicate = HasFmaLegacy32
+} // End SubtargetPredicate = HasFmacLegacy32
let SubtargetPredicate = HasFmacF64Inst,
Constraints = "$vdst = $src2",
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 05ba76a..faab9f3 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -151,7 +151,7 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod,
class VOP3_INTERP16 <list<ValueType> ArgVT, bit OpSel = 0> : VOPProfile<ArgVT> {
let IsSingle = 1;
- let HasOMod = !ne(DstVT.Value, f16.Value);
+ let HasOMod = !ne(DstVT, f16);
let HasHigh = 1;
let HasOpSel = OpSel;
@@ -185,7 +185,8 @@ defm V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32",
defm V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>, VOPD_Component<0x13, "v_fma_f32">;
-defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
+let SubtargetPredicate = HasLerpInst in
+ defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
let SchedRW = [WriteIntMul] in {
let SubtargetPredicate = HasMadU32Inst in
@@ -258,12 +259,12 @@ defm V_DIV_FMAS_F64 : VOP3Inst <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC>;
} // End isCommutable = 1
let isReMaterializable = 1 in {
-let mayRaiseFPException = 0 in {
+let mayRaiseFPException = 0, SubtargetPredicate = HasCubeInsts in {
defm V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>;
defm V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>;
defm V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubetc>;
defm V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubema>;
-} // End mayRaiseFPException
+} // mayRaiseFPException = 0, SubtargetPredicate = HasCubeInsts
defm V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>;
defm V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>;
@@ -306,12 +307,12 @@ let SubtargetPredicate = HasMinimum3Maximum3F32, ReadsModeReg = 0 in {
defm V_MAXIMUM3_F32 : VOP3Inst <"v_maximum3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmaximum3>;
} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
-let isCommutable = 1 in {
+let isCommutable = 1, SubtargetPredicate = HasSadInsts in {
defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-} // End isCommutable = 1
+} // End isCommutable = 1, SubtargetPredicate = HasSadInsts
defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", DIV_FIXUP_F32_PROF, AMDGPUdiv_fixup>;
@@ -424,7 +425,8 @@ def VOPProfileMQSAD : VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP> {
let SubtargetPredicate = isGFX7Plus in {
let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
-defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
+let SubtargetPredicate = HasQsadInsts in
+ defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>;
} // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32]
} // End SubtargetPredicate = isGFX7Plus
@@ -789,9 +791,6 @@ let isCommutable = 1 in {
defm V_MAD_I32_I16 : VOP3Inst_t16 <"v_mad_i32_i16", VOP_I32_I16_I16_I32>;
} // End isCommutable = 1
-defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>;
-defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>;
-
defm V_PACK_B32_F16 : VOP3Inst_t16 <"v_pack_b32_f16", VOP_B32_F16_F16>;
let isReMaterializable = 1 in {
@@ -996,6 +995,11 @@ def : GCNPat<(DivergentBinFrag<or> (or_oneuse i64:$src0, i64:$src1), i64:$src2),
} // End SubtargetPredicate = isGFX9Plus
+let SubtargetPredicate = HasCvtPkNormVOP3Insts in {
+ defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>;
+ defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>;
+} // end SubtargetPredicate = HasCvtPkNormVOP3Insts
+
// FIXME: Probably should hardcode clamp bit in pseudo and avoid this.
class OpSelBinOpClampPat<SDPatternOperator node,
Instruction inst> : GCNPat<
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 4ae2c1e..2dfa905 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1364,16 +1364,10 @@ class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> {
field bit is_wmma_xdl;
}
-def WMMAOpcode : GenericEnum {
- let FilterClass = "VOP3P_Pseudo";
-}
-
class WMMAMappingTable : GenericTable {
let FilterClass = "WMMAOpcodeMapping";
let CppTypeName = "WMMAOpcodeMappingInfo";
let Fields = ["Opcode2Addr", "Opcode3Addr"];
- string TypeOf_Opcode2Addr = "WMMAOpcode";
- string TypeOf_Opcode3Addr = "WMMAOpcode";
}
def WMMAOpcode2AddrMappingTable : WMMAMappingTable {
@@ -1707,7 +1701,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse
defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2");
defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
- let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+ let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in {
let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in
def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo {
let PseudoInstr = Instr#PseudoInstrSuffix;
@@ -1734,7 +1728,7 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P
let mayRaiseFPException = 0;
let ReadsModeReg = 0;
let AsmMatchConverter = "cvtSWMMAC";
-
+ let isConvergent = 1;
let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef";
}
}
@@ -1906,8 +1900,10 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16
defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">;
} // End is_wmma_xdl = 1.
-defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>;
-defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>;
+let isConvergent = 1 in {
+ defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>;
+ defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>;
+}
} // End SubtargetPredicate = isGFX125xOnly
} // End WaveSizePredicate = isWave32
@@ -2216,7 +2212,7 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO
let Inst{23-16} = LdScaleOp;
let Inst{40-32} = scale_src0;
let Inst{49-41} = scale_src1;
- let Inst{58-50} = 0; // scale src2
+ let Inst{58-50} = 0x100; // scale src2 = vgpr0 (dummy)
let Inst{59} = matrix_b_scale{0}; // scale_op_sel_hi(0)
let Inst{60} = 0; // scale_op_sel_hi(1)
let Inst{63-61} = {0, matrix_a_scale_fmt{1-0}}; // neg (lo)
@@ -2431,6 +2427,15 @@ multiclass VOP3P_Real_with_name_gfx12<bits<8> op,
string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> :
VOP3P_Real_with_name<GFX12Gen, op, backing_ps_name, asmName>;
+multiclass VOP3P_Real_LD_SCALE_gfx1250<bits<8> op> {
+ defvar ps = !cast<VOP3P_Pseudo>(NAME);
+ def _gfx1250 :
+ VOP3P_Real_Gen<ps, GFX1250Gen, ps.Mnemonic>,
+ VOP3Pe_gfx11_gfx12<op, ps.Pfl> {
+ let Inst{58-50} = 0x100; // scale src2 = vgpr0 (dummy)
+ }
+}
+
defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">;
defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">;
@@ -2460,8 +2465,8 @@ defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>;
defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>;
defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>;
-defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_gfx1250<0x35>;
-defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_gfx1250<0x3a>;
+defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_LD_SCALE_gfx1250<0x35>;
+defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_LD_SCALE_gfx1250<0x3a>;
let AssemblerPredicate = isGFX1250Plus in
def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">;
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 2730ec5..a829b80 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -1233,18 +1233,12 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
// We need to use COPY_TO_REGCLASS to w/a the problem when ReplaceAllUsesWith()
// complains it cannot replace i1 <-> i64/i32 if node was not morphed in place.
multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt, dag dstInst = (inst $src0, $src1)> {
- let WaveSizePredicate = isWave64 in
def : GCNPat <
- (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
- (i64 (COPY_TO_REGCLASS dstInst, SReg_64))
+ (WaveSizeVT (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
+ dstInst
>;
let WaveSizePredicate = isWave32 in {
- def : GCNPat <
- (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
- (i32 (COPY_TO_REGCLASS dstInst, SReg_32))
- >;
-
// Support codegen of i64 setcc in wave32 mode.
def : GCNPat <
(i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 8325c62..ea3edb8 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1357,8 +1357,12 @@ class VOPBinOpClampPat<SDPatternOperator node, Instruction inst, ValueType vt> :
class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
dag src0 = !if(P.HasOMod,
- (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
- (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp));
+ !if(P.HasClamp,
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i32:$omod)),
+ !if(P.HasClamp,
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers)));
list<dag> ret3 = [(set P.DstVT:$vdst,
(DivergentFragOrOp<node, P>.ret (P.Src0VT src0),
@@ -2204,12 +2208,12 @@ include "VOP3PInstructions.td"
include "VOPDInstructions.td"
class ClassPat<Instruction inst, ValueType vt> : GCNPat <
- (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)),
+ (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))),
(inst i32:$src0_mods, vt:$src0, (V_MOV_B32_e32 timm:$mask))
>;
class ClassPat_t16<Instruction inst, ValueType vt> : GCNPat <
- (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)),
+ (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))),
(inst i32:$src0_mods, vt:$src0, SRCMODS.NONE, (V_MOV_B32_e32 timm:$mask))
>;