Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPU.h | 23
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPU.td | 1854
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp | 26
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 103
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp | 10
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 72
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 74
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp | 120
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h | 24
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 12
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 11
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 325
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 17
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUFeatures.td | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 9
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp | 77
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h | 24
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 73
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 225
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 18
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 495
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 239
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 21
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 145
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 46
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 498
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 46
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 33
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 335
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 125
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 47
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp | 240
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 151
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | 99
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 159
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp | 291
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 35
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp | 133
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h | 24
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 16
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 25
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 9
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp | 11
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp | 21
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 863
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp | 148
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 477
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h | 41
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 582
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h | 46
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 135
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 12
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp | 29
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td | 12
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp | 48
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h | 40
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 12
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 120
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 12
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 533
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 273
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 21
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 108
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 90
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp | 27
-rw-r--r-- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 182
-rw-r--r-- llvm/lib/Target/AMDGPU/BUFInstructions.td | 359
-rw-r--r-- llvm/lib/Target/AMDGPU/CMakeLists.txt | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/DSInstructions.td | 69
-rw-r--r-- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 64
-rw-r--r-- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/EvergreenInstructions.td | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/FLATInstructions.td | 237
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp | 26
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 405
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 9
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 18
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNProcessors.td | 12
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 180
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 48
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 1721
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 372
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 29
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 1162
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp | 58
-rw-r--r-- llvm/lib/Target/AMDGPU/InstCombineTables.td | 10
-rw-r--r-- llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 183
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 65
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp | 34
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h | 7
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 15
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/MIMGInstructions.td | 199
-rw-r--r-- llvm/lib/Target/AMDGPU/R600.td | 21
-rw-r--r-- llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 34
-rw-r--r-- llvm/lib/Target/AMDGPU/R600ISelLowering.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/R600InstrInfo.cpp | 10
-rw-r--r-- llvm/lib/Target/AMDGPU/R600InstrInfo.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/R600Instructions.td | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/R600MCInstLower.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/R600Processors.td | 20
-rw-r--r-- llvm/lib/Target/AMDGPU/R600Subtarget.h | 25
-rw-r--r-- llvm/lib/Target/AMDGPU/R600TargetMachine.cpp | 12
-rw-r--r-- llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp | 15
-rw-r--r-- llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h | 9
-rw-r--r-- llvm/lib/Target/AMDGPU/SIDefines.h | 26
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 57
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 158
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 72
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFrameLowering.h | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1242
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.h | 16
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2525
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1185
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 300
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 152
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstructions.td | 299
-rw-r--r-- llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp | 35
-rw-r--r-- llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 246
-rw-r--r-- llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 9
-rw-r--r-- llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 35
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 13
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 65
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineScheduler.h | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 1488
-rw-r--r-- llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp | 16
-rw-r--r-- llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 7
-rw-r--r-- llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 28
-rw-r--r-- llvm/lib/Target/AMDGPU/SIPostRABundler.cpp | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 90
-rw-r--r-- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 257
-rw-r--r-- llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 40
-rw-r--r-- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 273
-rw-r--r-- llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 144
-rw-r--r-- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 76
-rw-r--r-- llvm/lib/Target/AMDGPU/SMInstructions.td | 83
-rw-r--r-- llvm/lib/Target/AMDGPU/SOPInstructions.td | 1245
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 31
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h | 19
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 293
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 221
-rw-r--r-- llvm/lib/Target/AMDGPU/VINTERPInstructions.td | 18
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 472
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 39
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 114
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 386
-rw-r--r-- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 59
-rw-r--r-- llvm/lib/Target/AMDGPU/VOPInstructions.td | 127
187 files changed, 17235 insertions, 10575 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index ce2b4a5..5df11a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -69,7 +69,7 @@ FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass();
ModulePass *createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *);
struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
- AMDGPUSimplifyLibCallsPass() {}
+ AMDGPUSimplifyLibCallsPass() = default;
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
@@ -298,6 +298,15 @@ private:
bool GlobalOpt;
};
+void initializeAMDGPULowerExecSyncLegacyPass(PassRegistry &);
+extern char &AMDGPULowerExecSyncLegacyPassID;
+ModulePass *createAMDGPULowerExecSyncLegacyPass();
+
+struct AMDGPULowerExecSyncPass : PassInfoMixin<AMDGPULowerExecSyncPass> {
+ AMDGPULowerExecSyncPass() = default;
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &);
extern char &AMDGPUSwLowerLDSLegacyPassID;
ModulePass *
@@ -371,13 +380,13 @@ public:
class AMDGPUAnnotateUniformValuesPass
: public PassInfoMixin<AMDGPUAnnotateUniformValuesPass> {
public:
- AMDGPUAnnotateUniformValuesPass() {}
+ AMDGPUAnnotateUniformValuesPass() = default;
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
class SIModeRegisterPass : public PassInfoMixin<SIModeRegisterPass> {
public:
- SIModeRegisterPass() {}
+ SIModeRegisterPass() = default;
PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &AM);
};
@@ -527,7 +536,7 @@ void initializeAMDGPUAAWrapperPassPass(PassRegistry&);
ImmutablePass *createAMDGPUExternalAAWrapperPass();
void initializeAMDGPUExternalAAWrapperPass(PassRegistry&);
-void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
+void initializeAMDGPUArgumentUsageInfoWrapperLegacyPass(PassRegistry &);
ModulePass *createAMDGPUExportKernelRuntimeHandlesLegacyPass();
void initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(PassRegistry &);
@@ -562,9 +571,13 @@ public:
void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &);
extern char &AMDGPURewriteAGPRCopyMFMALegacyID;
+void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &);
+extern char &AMDGPUUniformIntrinsicCombineLegacyPassID;
+FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass();
+
struct AMDGPUUniformIntrinsicCombinePass
: public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
- PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
namespace AMDGPU {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index ea32748..9ad2f2e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -19,69 +19,105 @@ def p4 : PtrValueType<i64, 4>;
def p5 : PtrValueType<i32, 5>;
def p6 : PtrValueType<i32, 6>;
-//===------------------------------------------------------------===//
-// Subtarget Features (device properties)
-//===------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// AMDGPU Subtarget Features (device properties)
+//===----------------------------------------------------------------------===//
-def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
- "FastFMAF32",
- "true",
- "Assuming f32 fma is at least as fast as mul + add"
+// Multiclass to define a SubtargetFeature along with optional predicates.
+// Parameters:
+// - FeatureString: The feature string used in the SubtargetFeature.
+// - Description: The description of the feature.
+// - GenPredicate: If 1 (default), generates a Has#NAME predicate.
+// - GenAssemblerPredicate: If 1 (default), the predicate includes AssemblerPredicate.
+// - Deps: List of dependent SubtargetFeatures (default empty).
+//
+// Usage:
+// defm MadMixInsts : AMDGPUSubtargetFeature<"mad-mix-insts", "description">;
+// This generates:
+// - FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts", "HasMadMixInsts", "true", "description">
+// - HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">,
+// AssemblerPredicate<(any_of FeatureMadMixInsts)>
+//
+// With GenAssemblerPredicate=0:
+// defm Foo : AMDGPUSubtargetFeature<"foo", "desc", 1, 0>;
+// This generates:
+// - FeatureFoo : SubtargetFeature<...>
+// - HasFoo : Predicate<"Subtarget->hasFoo()"> (no AssemblerPredicate)
+//
+// With dependencies:
+// defm Bar : AMDGPUSubtargetFeature<"bar", "desc", 1, 1, [FeatureFoo]>;
+// This generates:
+// - FeatureBar : SubtargetFeature<"bar", "HasBar", "true", "desc", [FeatureFoo]>
+// - HasBar : Predicate + AssemblerPredicate
+multiclass AMDGPUSubtargetFeature<string FeatureString,
+ string Description,
+ bit GenPredicate = 1,
+ bit GenAssemblerPredicate = 1,
+ list<SubtargetFeature> Deps = []> {
+ def Feature#NAME : SubtargetFeature<FeatureString,
+ "Has"#NAME,
+ "true",
+ Description,
+ Deps
+ >;
+
+ if GenPredicate then
+ if GenAssemblerPredicate then
+ def Has#NAME
+ : Predicate<"Subtarget->has"#NAME#"()">,
+ AssemblerPredicate<(any_of !cast<SubtargetFeature>("Feature"#NAME))>;
+ else
+ def Has#NAME : Predicate<"Subtarget->has"#NAME#"()">;
+}
+
+defm FastFMAF32 : AMDGPUSubtargetFeature<"fast-fmaf",
+ "Assuming f32 fma is at least as fast as mul + add",
+ /*GenPredicate=*/0
>;
-def FeatureFastDenormalF32 : SubtargetFeature<"fast-denormal-f32",
- "FastDenormalF32",
- "true",
- "Enabling denormals does not cause f32 instructions to run at f64 rates"
+defm FastDenormalF32 : AMDGPUSubtargetFeature<"fast-denormal-f32",
+ "Enabling denormals does not cause f32 instructions to run at f64 rates",
+ /*GenPredicate=*/0
>;
-def FeatureMIMG_R128 : SubtargetFeature<"mimg-r128",
- "MIMG_R128",
- "true",
- "Support 128-bit texture resources"
+defm MIMG_R128 : AMDGPUSubtargetFeature<"mimg-r128",
+ "Support 128-bit texture resources",
+ /*GenPredicate=*/0
>;
-def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops",
- "HalfRate64Ops",
- "true",
- "Most fp64 instructions are half rate instead of quarter"
+defm HalfRate64Ops : AMDGPUSubtargetFeature<"half-rate-64-ops",
+ "Most fp64 instructions are half rate instead of quarter",
+ /*GenPredicate=*/0
>;
-def FullRate64Ops : SubtargetFeature<"full-rate-64-ops",
- "FullRate64Ops",
- "true",
- "Most fp64 instructions are full rate"
+defm FullRate64Ops : AMDGPUSubtargetFeature<"full-rate-64-ops",
+ "Most fp64 instructions are full rate",
+ /*GenPredicate=*/0
>;
-def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
- "FlatAddressSpace",
- "true",
+defm FlatAddressSpace : AMDGPUSubtargetFeature<"flat-address-space",
"Support flat address space"
>;
-def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets",
- "FlatInstOffsets",
- "true",
+defm FlatInstOffsets : AMDGPUSubtargetFeature<"flat-inst-offsets",
"Flat instructions have immediate offset addressing mode"
>;
-def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts",
- "FlatGlobalInsts",
- "true",
+defm FlatGlobalInsts : AMDGPUSubtargetFeature<"flat-global-insts",
"Have global_* flat memory instructions",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/1,
[FeatureFlatAddressSpace]
>;
-def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts",
- "FlatScratchInsts",
- "true",
+defm FlatScratchInsts : AMDGPUSubtargetFeature<"flat-scratch-insts",
"Have scratch_* flat memory instructions",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/1,
[FeatureFlatAddressSpace]
>;
-def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts",
- "ScalarFlatScratchInsts",
- "true",
+defm ScalarFlatScratchInsts : AMDGPUSubtargetFeature<"scalar-flat-scratch-insts",
"Have s_scratch_* flat memory instructions"
>;
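
As a concrete illustration of the conversion pattern (a sketch derived from the multiclass documentation above, not actual TableGen output), the defm FlatGlobalInsts in this hunk should expand to roughly:

  def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts",
    "HasFlatGlobalInsts",
    "true",
    "Have global_* flat memory instructions",
    [FeatureFlatAddressSpace]
  >;

  def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,
    AssemblerPredicate<(any_of FeatureFlatGlobalInsts)>;

Note that the multiclass always names the subtarget member "Has"#NAME, whereas some of the old hand-written records used a bare member name (e.g. "FlatGlobalInsts"); the GCNSubtarget.h churn in the diffstat is consistent with that rename.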
@@ -91,100 +127,74 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch",
"Use scratch_* flat memory instructions to access scratch"
>;
-def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode",
- "FlatGVSMode",
- "true",
+defm FlatGVSMode : AMDGPUSubtargetFeature<"flat-gvs-mode",
"Have GVS addressing mode with flat_* instructions",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/1,
[FeatureFlatAddressSpace]
>;
-def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts",
- "AddNoCarryInsts",
- "true",
+defm AddNoCarryInsts : AMDGPUSubtargetFeature<"add-no-carry-insts",
"Have VALU add/sub instructions without carry out"
>;
-def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
- "UnalignedBufferAccess",
- "true",
+defm UnalignedBufferAccess : AMDGPUSubtargetFeature<"unaligned-buffer-access",
"Hardware supports unaligned global loads and stores"
>;
-def FeatureTrapHandler: SubtargetFeature<"trap-handler",
- "TrapHandler",
- "true",
- "Trap handler support"
+defm TrapHandler : AMDGPUSubtargetFeature<"trap-handler",
+ "Trap handler support",
+ /*GenPredicate=*/0
>;
-def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access",
- "UnalignedScratchAccess",
- "true",
+defm UnalignedScratchAccess : AMDGPUSubtargetFeature<"unaligned-scratch-access",
"Support unaligned scratch loads and stores"
>;
-def FeatureUnalignedDSAccess : SubtargetFeature<"unaligned-ds-access",
- "UnalignedDSAccess",
- "true",
+defm UnalignedDSAccess : AMDGPUSubtargetFeature<"unaligned-ds-access",
"Hardware supports unaligned local and region loads and stores"
>;
-def FeatureRelaxedBufferOOBMode : SubtargetFeature<"relaxed-buffer-oob-mode",
- "RelaxedBufferOOBMode",
- "true",
- "Disable strict out-of-bounds buffer guarantees. An OOB access may potentially cause an adjacent access to be treated as if it were also OOB"
+defm RelaxedBufferOOBMode : AMDGPUSubtargetFeature<"relaxed-buffer-oob-mode",
+ "Disable strict out-of-bounds buffer guarantees. An OOB access may potentially"
+ "cause an adjacent access to be treated as if it were also OOB"
>;
-def FeatureApertureRegs : SubtargetFeature<"aperture-regs",
- "HasApertureRegs",
- "true",
- "Has Memory Aperture Base and Size Registers"
+defm ApertureRegs : AMDGPUSubtargetFeature<"aperture-regs",
+ "Has Memory Aperture Base and Size Registers",
+ /*GenPredicate=*/0
>;
-def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts",
- "HasMadMixInsts",
- "true",
+defm MadMixInsts : AMDGPUSubtargetFeature<"mad-mix-insts",
"Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions"
>;
-def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
- "HasFmaMixInsts",
- "true",
+defm FmaMixInsts : AMDGPUSubtargetFeature<"fma-mix-insts",
"Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
>;
-def FeatureFmaMixBF16Insts : SubtargetFeature<"fma-mix-bf16-insts",
- "HasFmaMixBF16Insts",
- "true",
+defm FmaMixBF16Insts : AMDGPUSubtargetFeature<"fma-mix-bf16-insts",
"Has v_fma_mix_f32_bf16, v_fma_mixlo_bf16, v_fma_mixhi_bf16 instructions"
>;
-def FeatureIEEEMinimumMaximumInsts : SubtargetFeature<"ieee-minimum-maximum-insts",
- "HasIEEEMinimumMaximumInsts",
- "true",
- "Has v_minimum/maximum_f16/f32/f64, v_minimummaximum/maximumminimum_f16/f32 and v_pk_minimum/maximum_f16 instructions"
+defm IEEEMinimumMaximumInsts : AMDGPUSubtargetFeature<"ieee-minimum-maximum-insts",
+ "Has v_minimum/maximum_f16/f32/f64, v_minimummaximum/maximumminimum_f16/f32 and"
+ "v_pk_minimum/maximum_f16 instructions"
>;
-def FeatureMinimum3Maximum3F32 : SubtargetFeature<"minimum3-maximum3-f32",
- "HasMinimum3Maximum3F32",
- "true",
+defm Minimum3Maximum3F32 : AMDGPUSubtargetFeature<"minimum3-maximum3-f32",
"Has v_minimum3_f32 and v_maximum3_f32 instructions"
>;
-def FeatureMinimum3Maximum3F16 : SubtargetFeature<"minimum3-maximum3-f16",
- "HasMinimum3Maximum3F16",
- "true",
+defm Minimum3Maximum3F16 : AMDGPUSubtargetFeature<"minimum3-maximum3-f16",
"Has v_minimum3_f16 and v_maximum3_f16 instructions"
>;
-def FeatureMin3Max3PKF16 : SubtargetFeature<"min3-max3-pkf16",
- "HasMin3Max3PKF16",
- "true",
+defm Min3Max3PKF16 : AMDGPUSubtargetFeature<"min3-max3-pkf16",
"Has v_pk_min3_num_f16 and v_pk_max3_num_f16 instructions"
>;
-def FeatureMinimum3Maximum3PKF16 : SubtargetFeature<"minimum3-maximum3-pkf16",
- "HasMinimum3Maximum3PKF16",
- "true",
+defm Minimum3Maximum3PKF16 : AMDGPUSubtargetFeature<"minimum3-maximum3-pkf16",
"Has v_pk_minimum3_f16 and v_pk_maximum3_f16 instructions"
>;
@@ -223,82 +233,67 @@ def FeaturePreciseMemory
: SubtargetFeature<"precise-memory", "EnablePreciseMemory",
"true", "Enable precise memory mode">;
-def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
- "SGPRInitBug",
- "true",
+defm SGPRInitBug : AMDGPUSubtargetFeature<"sgpr-init-bug",
"VI SGPR initialization bug requiring a fixed SGPR allocation size"
>;
-def FeatureUserSGPRInit16Bug : SubtargetFeature<"user-sgpr-init16-bug",
- "UserSGPRInit16Bug",
- "true",
- "Bug requiring at least 16 user+system SGPRs to be enabled"
+defm UserSGPRInit16Bug : AMDGPUSubtargetFeature<"user-sgpr-init16-bug",
+ "Bug requiring at least 16 user+system SGPRs to be enabled",
+ /*GenPredicate=*/0
>;
-def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug",
- "LDSMisalignedBug",
- "true",
- "Some GFX10 bug with multi-dword LDS and flat access that is not naturally aligned in WGP mode"
+defm LDSMisalignedBug : AMDGPUSubtargetFeature<"lds-misaligned-bug",
+ "Some GFX10 bug with multi-dword LDS and flat access that is not naturally aligned in WGP mode",
+ /*GenPredicate=*/0
>;
-def FeatureMFMAInlineLiteralBug : SubtargetFeature<"mfma-inline-literal-bug",
- "HasMFMAInlineLiteralBug",
- "true",
- "MFMA cannot use inline literal as SrcC"
+defm MFMAInlineLiteralBug : AMDGPUSubtargetFeature<"mfma-inline-literal-bug",
+ "MFMA cannot use inline literal as SrcC",
+ /*GenPredicate=*/0
>;
-def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard",
- "HasVcmpxPermlaneHazard",
- "true",
- "TODO: describe me"
+defm VcmpxPermlaneHazard : AMDGPUSubtargetFeature<"vcmpx-permlane-hazard",
+ "TODO: describe me",
+ /*GenPredicate=*/0
>;
-def FeatureVMEMtoScalarWriteHazard : SubtargetFeature<"vmem-to-scalar-write-hazard",
- "HasVMEMtoScalarWriteHazard",
- "true",
- "VMEM instruction followed by scalar writing to EXEC mask, M0 or SGPR leads to incorrect execution."
+defm VMEMtoScalarWriteHazard : AMDGPUSubtargetFeature<"vmem-to-scalar-write-hazard",
+ "VMEM instruction followed by scalar writing to EXEC mask, M0 or SGPR leads to incorrect execution.",
+ /*GenPredicate=*/0
>;
-def FeatureSMEMtoVectorWriteHazard : SubtargetFeature<"smem-to-vector-write-hazard",
- "HasSMEMtoVectorWriteHazard",
- "true",
- "s_load_dword followed by v_cmp page faults"
+defm SMEMtoVectorWriteHazard : AMDGPUSubtargetFeature<"smem-to-vector-write-hazard",
+ "s_load_dword followed by v_cmp page faults",
+ /*GenPredicate=*/0
>;
-def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug",
- "HasInstFwdPrefetchBug",
- "true",
- "S_INST_PREFETCH instruction causes shader to hang"
+defm InstFwdPrefetchBug : AMDGPUSubtargetFeature<"inst-fwd-prefetch-bug",
+ "S_INST_PREFETCH instruction causes shader to hang",
+ /*GenPredicate=*/0
>;
-def FeatureVmemPrefInsts : SubtargetFeature<"vmem-pref-insts",
- "HasVmemPrefInsts",
- "true",
+defm VmemPrefInsts : AMDGPUSubtargetFeature<"vmem-pref-insts",
"Has flat_prefect_b8 and global_prefetch_b8 instructions"
>;
-def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch",
- "HasSafeSmemPrefetch",
- "true",
- "SMEM prefetches do not fail on illegal address"
+defm SafeSmemPrefetch : AMDGPUSubtargetFeature<"safe-smem-prefetch",
+ "SMEM prefetches do not fail on illegal address",
+ /*GenPredicate=*/0
>;
-def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch",
- "HasSafeCUPrefetch",
- "true",
- "VMEM CU scope prefetches do not fail on illegal address"
+defm SafeCUPrefetch : AMDGPUSubtargetFeature<"safe-cu-prefetch",
+ "VMEM CU scope prefetches do not fail on illegal address",
+ /*GenPredicate=*/0
>;
-def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
- "HasVcmpxExecWARHazard",
- "true",
- "V_CMPX WAR hazard on EXEC (V_CMPX issue ONLY)"
+defm VcmpxExecWARHazard : AMDGPUSubtargetFeature<"vcmpx-exec-war-hazard",
+ "V_CMPX WAR hazard on EXEC (V_CMPX issue ONLY)",
+ /*GenPredicate=*/0
>;
-def FeatureLdsBranchVmemWARHazard : SubtargetFeature<"lds-branch-vmem-war-hazard",
- "HasLdsBranchVmemWARHazard",
- "true",
- "Switching between LDS and VMEM-tex not waiting VM_VSRC=0"
+defm LdsBranchVmemWARHazard : AMDGPUSubtargetFeature<"lds-branch-vmem-war-hazard",
+ "Switching between LDS and VMEM-tex not waiting VM_VSRC=0",
+ /*GenPredicate=*/0
>;
class FeatureMaxHardClauseLength<int size> : SubtargetFeature<
@@ -316,70 +311,60 @@ def FeatureMaxHardClauseLength32 : FeatureMaxHardClauseLength<32>;
/// permitted clause length.
def FeatureMaxHardClauseLength63 : FeatureMaxHardClauseLength<63>;
-def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug",
- "HasNSAtoVMEMBug",
- "true",
- "MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero"
+defm NSAtoVMEMBug : AMDGPUSubtargetFeature<"nsa-to-vmem-bug",
+ "MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero",
+ /*GenPredicate=*/0
>;
-def FeatureNSAClauseBug : SubtargetFeature<"nsa-clause-bug",
- "HasNSAClauseBug",
- "true",
- "MIMG-NSA in a hard clause has unpredictable results on GFX10.1"
+defm NSAClauseBug : AMDGPUSubtargetFeature<"nsa-clause-bug",
+ "MIMG-NSA in a hard clause has unpredictable results on GFX10.1",
+ /*GenPredicate=*/0
>;
-def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug",
- "HasFlatSegmentOffsetBug",
- "true",
- "GFX10 bug where inst_offset is ignored when flat instructions access global memory"
+defm FlatSegmentOffsetBug : AMDGPUSubtargetFeature<"flat-segment-offset-bug",
+ "GFX10 bug where inst_offset is ignored when flat instructions access global memory",
+ /*GenPredicate=*/0
>;
-def FeatureNegativeScratchOffsetBug : SubtargetFeature<"negative-scratch-offset-bug",
- "NegativeScratchOffsetBug",
- "true",
+defm NegativeScratchOffsetBug : AMDGPUSubtargetFeature<"negative-scratch-offset-bug",
"Negative immediate offsets in scratch instructions with an SGPR offset page fault on GFX9"
>;
-def FeatureNegativeUnalignedScratchOffsetBug : SubtargetFeature<"negative-unaligned-scratch-offset-bug",
- "NegativeUnalignedScratchOffsetBug",
- "true",
- "Scratch instructions with a VGPR offset and a negative immediate offset that is not a multiple of 4 read wrong memory on GFX10"
+defm NegativeUnalignedScratchOffsetBug : AMDGPUSubtargetFeature<"negative-unaligned-scratch-offset-bug",
+ "Scratch instructions with a VGPR offset and a negative immediate offset that"
+ "is not a multiple of 4 read wrong memory on GFX10",
+ /*GenPredicate=*/0
>;
-def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug",
- "HasOffset3fBug",
- "true",
- "Branch offset of 3f hardware bug"
+defm Offset3fBug : AMDGPUSubtargetFeature<"offset-3f-bug",
+ "Branch offset of 3f hardware bug",
+ /*GenPredicate=*/0
>;
-def FeatureImageStoreD16Bug : SubtargetFeature<"image-store-d16-bug",
- "HasImageStoreD16Bug",
- "true",
- "Image Store D16 hardware bug"
+defm ImageStoreD16Bug : AMDGPUSubtargetFeature<"image-store-d16-bug",
+ "Image Store D16 hardware bug",
+ /*GenPredicate=*/0
>;
-def FeatureImageGather4D16Bug : SubtargetFeature<"image-gather4-d16-bug",
- "HasImageGather4D16Bug",
- "true",
- "Image Gather4 D16 hardware bug"
+defm ImageGather4D16Bug : AMDGPUSubtargetFeature<"image-gather4-d16-bug",
+ "Image Gather4 D16 hardware bug",
+ /*GenPredicate=*/0
>;
-def FeatureMADIntraFwdBug : SubtargetFeature<"mad-intra-fwd-bug",
- "HasMADIntraFwdBug",
- "true",
- "MAD_U64/I64 intra instruction forwarding bug"
+defm MADIntraFwdBug : AMDGPUSubtargetFeature<"mad-intra-fwd-bug",
+ "MAD_U64/I64 intra instruction forwarding bug",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/0
>;
-def FeatureMSAALoadDstSelBug : SubtargetFeature<"msaa-load-dst-sel-bug",
- "HasMSAALoadDstSelBug",
- "true",
- "MSAA loads not honoring dst_sel bug"
+defm MSAALoadDstSelBug : AMDGPUSubtargetFeature<"msaa-load-dst-sel-bug",
+ "MSAA loads not honoring dst_sel bug",
+ /*GenPredicate=*/0
>;
-def FeaturePrivEnabledTrap2NopBug : SubtargetFeature<"priv-enabled-trap2-nop-bug",
- "HasPrivEnabledTrap2NopBug",
- "true",
- "Hardware that runs with PRIV=1 interpreting 's_trap 2' as a nop bug"
+defm PrivEnabledTrap2NopBug : AMDGPUSubtargetFeature<"priv-enabled-trap2-nop-bug",
+ "Hardware that runs with PRIV=1 interpreting 's_trap 2' as a nop bug",
+ /*GenPredicate=*/0
>;
class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
@@ -392,28 +377,24 @@ class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;
def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>;
-def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding",
- "GCN3Encoding",
- "true",
- "Encoding format for VI"
+defm GCN3Encoding : AMDGPUSubtargetFeature<"gcn3-encoding",
+ "Encoding format for VI",
+ /*GenPredicate=*/0
>;
-def FeatureCIInsts : SubtargetFeature<"ci-insts",
- "CIInsts",
- "true",
- "Additional instructions for CI+"
+defm CIInsts : AMDGPUSubtargetFeature<"ci-insts",
+ "Additional instructions for CI+",
+ /*GenPredicate=*/0
>;
-def FeatureGFX8Insts : SubtargetFeature<"gfx8-insts",
- "GFX8Insts",
- "true",
- "Additional instructions for GFX8+"
+defm GFX8Insts : AMDGPUSubtargetFeature<"gfx8-insts",
+ "Additional instructions for GFX8+",
+ /*GenPredicate=*/0
>;
-def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",
- "GFX9Insts",
- "true",
- "Additional instructions for GFX9+"
+defm GFX9Insts : AMDGPUSubtargetFeature<"gfx9-insts",
+ "Additional instructions for GFX9+",
+ /*GenPredicate=*/0
>;
def FeatureRequiresAlignedVGPRs : SubtargetFeature<"vgpr-align2",
@@ -422,83 +403,72 @@ def FeatureRequiresAlignedVGPRs : SubtargetFeature<"vgpr-align2",
"VGPR and AGPR tuple operands require even alignment"
>;
-def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts",
- "GFX90AInsts",
- "true",
- "Additional instructions for GFX90A+"
- // [HasAtomicFMinFMaxF64GlobalInsts, HasAtomicFMinFMaxF64FlatInsts] // TODO
+defm GFX90AInsts : AMDGPUSubtargetFeature<"gfx90a-insts",
+ "Additional instructions for GFX90A+",
+ /*GenPredicate=*/0
>;
-def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts",
- "GFX940Insts",
- "true",
- "Additional instructions for GFX940+"
+defm GFX940Insts : AMDGPUSubtargetFeature<"gfx940-insts",
+ "Additional instructions for GFX940+",
+ /*GenPredicate=*/0
>;
-def FeaturePermlane16Swap : SubtargetFeature<"permlane16-swap",
- "HasPermlane16Swap",
- "true",
+defm Permlane16Swap : AMDGPUSubtargetFeature<"permlane16-swap",
"Has v_permlane16_swap_b32 instructions"
>;
-def FeaturePermlane32Swap : SubtargetFeature<"permlane32-swap",
- "HasPermlane32Swap",
- "true",
+defm Permlane32Swap : AMDGPUSubtargetFeature<"permlane32-swap",
"Has v_permlane32_swap_b32 instructions"
>;
-def FeatureFP8ConversionScaleInsts : SubtargetFeature<"fp8-cvt-scale-insts",
- "HasFP8ConversionScaleInsts",
- "true",
+defm FP8ConversionScaleInsts : AMDGPUSubtargetFeature<"fp8-cvt-scale-insts",
"Has fp8 conversion scale instructions"
>;
-def FeatureBF8ConversionScaleInsts : SubtargetFeature<"bf8-cvt-scale-insts",
- "HasBF8ConversionScaleInsts",
- "true",
+defm BF8ConversionScaleInsts : AMDGPUSubtargetFeature<"bf8-cvt-scale-insts",
"Has bf8 conversion scale instructions"
>;
-def FeatureFP4ConversionScaleInsts : SubtargetFeature<"fp4-cvt-scale-insts",
- "HasFP4ConversionScaleInsts",
- "true",
+defm FP4ConversionScaleInsts : AMDGPUSubtargetFeature<"fp4-cvt-scale-insts",
"Has fp4 conversion scale instructions"
>;
-def FeatureFP6BF6ConversionScaleInsts : SubtargetFeature<"fp6bf6-cvt-scale-insts",
- "HasFP6BF6ConversionScaleInsts",
- "true",
+defm FP6BF6ConversionScaleInsts : AMDGPUSubtargetFeature<"fp6bf6-cvt-scale-insts",
"Has fp6 and bf6 conversion scale instructions"
>;
-def FeatureF16BF16ToFP6BF6ConversionScaleInsts : SubtargetFeature<"f16bf16-to-fp6bf6-cvt-scale-insts",
- "HasF16BF16ToFP6BF6ConversionScaleInsts",
- "true",
+defm F16BF16ToFP6BF6ConversionScaleInsts : AMDGPUSubtargetFeature<"f16bf16-to-fp6bf6-cvt-scale-insts",
"Has f16bf16 to fp6bf6 conversion scale instructions"
>;
-def FeatureF32ToF16BF16ConversionSRInsts : SubtargetFeature<"f32-to-f16bf16-cvt-sr-insts",
- "HasF32ToF16BF16ConversionSRInsts",
- "true",
+defm F32ToF16BF16ConversionSRInsts : AMDGPUSubtargetFeature<"f32-to-f16bf16-cvt-sr-insts",
"Has f32 to f16bf16 conversion scale instructions"
>;
-def FeatureAshrPkInsts : SubtargetFeature<"ashr-pk-insts",
- "HasAshrPkInsts",
- "true",
+defm AshrPkInsts : AMDGPUSubtargetFeature<"ashr-pk-insts",
"Has Arithmetic Shift Pack instructions"
>;
-def FeatureCvtPkF16F32Inst : SubtargetFeature<"cvt-pk-f16-f32-inst",
- "HasCvtPkF16F32Inst",
- "true",
+defm CvtPkF16F32Inst : AMDGPUSubtargetFeature<"cvt-pk-f16-f32-inst",
"Has cvt_pk_f16_f32 instruction"
>;
-def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
- "GFX950Insts",
- "true",
+defm McastLoadInsts : AMDGPUSubtargetFeature<"mcast-load-insts",
+ "Has multicast load instructions"
+>;
+
+defm SWakeupImm : AMDGPUSubtargetFeature<"s-wakeup-imm",
+ "s_wakeup takes an immediate operand"
+>;
+
+defm SBarrierLeaveImm : AMDGPUSubtargetFeature<"s-barrier-leave-imm",
+ "s_barrier_leave takes an immediate operand"
+>;
+
+defm GFX950Insts : AMDGPUSubtargetFeature<"gfx950-insts",
"Additional instructions for GFX950+",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/1,
[FeaturePermlane16Swap,
FeaturePermlane32Swap,
FeatureAshrPkInsts,
@@ -514,63 +484,59 @@ def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
]
>;
-def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
- "GFX10Insts",
- "true",
- "Additional instructions for GFX10+"
+defm GFX10Insts : AMDGPUSubtargetFeature<"gfx10-insts",
+ "Additional instructions for GFX10+",
+ /*GenPredicate=*/0
>;
-def FeatureGFX11Insts : SubtargetFeature<"gfx11-insts",
- "GFX11Insts",
- "true",
- "Additional instructions for GFX11+"
+defm GFX11Insts : AMDGPUSubtargetFeature<"gfx11-insts",
+ "Additional instructions for GFX11+",
+ /*GenPredicate=*/0
>;
-def FeatureGFX12Insts : SubtargetFeature<"gfx12-insts",
- "GFX12Insts",
- "true",
- "Additional instructions for GFX12+"
+defm GFX12Insts : AMDGPUSubtargetFeature<"gfx12-insts",
+ "Additional instructions for GFX12+",
+ /*GenPredicate=*/0
>;
-def FeatureGFX1250Insts : SubtargetFeature<"gfx1250-insts",
- "GFX1250Insts",
- "true",
- "Additional instructions for GFX1250+"
+defm GFX1250Insts : AMDGPUSubtargetFeature<"gfx1250-insts",
+ "Additional instructions for GFX1250+",
+ /*GenPredicate=*/0
>;
-def FeatureGFX10_3Insts : SubtargetFeature<"gfx10-3-insts",
- "GFX10_3Insts",
- "true",
- "Additional instructions for GFX10.3"
+defm GFX13Insts : AMDGPUSubtargetFeature<"gfx13-insts",
+ "Additional instructions for GFX13+",
+ /*GenPredicate=*/0,
+ /*GenAssemblerPredicate=*/0,
+ [FeatureSWakeupImm,
+ FeatureSBarrierLeaveImm,
+ ]
>;
-def FeatureGFX7GFX8GFX9Insts : SubtargetFeature<"gfx7-gfx8-gfx9-insts",
- "GFX7GFX8GFX9Insts",
- "true",
- "Instructions shared in GFX7, GFX8, GFX9"
+defm GFX10_3Insts : AMDGPUSubtargetFeature<"gfx10-3-insts",
+ "Additional instructions for GFX10.3",
+ /*GenPredicate=*/0
>;
-def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime",
- "HasSMemRealTime",
- "true",
+defm GFX7GFX8GFX9Insts : AMDGPUSubtargetFeature<"gfx7-gfx8-gfx9-insts",
+ "Instructions shared in GFX7, GFX8, GFX9",
+ /*GenPredicate=*/0
+>;
+
+defm SMemRealTime : AMDGPUSubtargetFeature<"s-memrealtime",
"Has s_memrealtime instruction"
>;
-def FeatureInv2PiInlineImm : SubtargetFeature<"inv-2pi-inline-imm",
- "HasInv2PiInlineImm",
- "true",
- "Has 1 / (2 * pi) as inline immediate"
+defm Inv2PiInlineImm : AMDGPUSubtargetFeature<"inv-2pi-inline-imm",
+ "Has 1 / (2 * pi) as inline immediate",
+ /*GenPredicate=*/0
>;
-def Feature16BitInsts : SubtargetFeature<"16-bit-insts",
- "Has16BitInsts",
- "true",
+defm 16BitInsts : AMDGPUSubtargetFeature<"16-bit-insts",
"Has i16/f16 instructions"
>;
-def FeatureTrue16BitInsts : SubtargetFeature<"true16",
- "HasTrue16BitInsts",
- "true",
+defm True16BitInsts : AMDGPUSubtargetFeature<"true16",
"True 16-bit operand instructions"
>;
@@ -580,100 +546,75 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
"Use true 16-bit registers"
>;
-def FeatureD16Writes32BitVgpr : SubtargetFeature<"d16-write-vgpr32",
- "EnableD16Writes32BitVgpr",
- "true",
+defm D16Writes32BitVgpr : AMDGPUSubtargetFeature<"d16-write-vgpr32",
"D16 instructions potentially have 32-bit data dependencies"
>;
-def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
- "HasBF16TransInsts",
- "true",
+defm BF16TransInsts : AMDGPUSubtargetFeature<"bf16-trans-insts",
"Has bf16 transcendental instructions"
>;
-def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts",
- "HasBF16ConversionInsts",
- "true",
+defm BF16ConversionInsts : AMDGPUSubtargetFeature<"bf16-cvt-insts",
"Has bf16 conversion instructions"
>;
-def FeatureBF16PackedInsts : SubtargetFeature<"bf16-pk-insts",
- "HasBF16PackedInsts",
- "true",
+defm BF16PackedInsts : AMDGPUSubtargetFeature<"bf16-pk-insts",
"Has bf16 packed instructions (fma, add, mul, max, min)"
>;
-def FeatureVOP3P : SubtargetFeature<"vop3p",
- "HasVOP3PInsts",
- "true",
+defm VOP3PInsts : AMDGPUSubtargetFeature<"vop3p",
"Has VOP3P packed instructions"
>;
-def FeatureMovrel : SubtargetFeature<"movrel",
- "HasMovrel",
- "true",
+defm Movrel : AMDGPUSubtargetFeature<"movrel",
"Has v_movrel*_b32 instructions"
>;
-def FeatureVGPRIndexMode : SubtargetFeature<"vgpr-index-mode",
- "HasVGPRIndexMode",
- "true",
+defm VGPRIndexMode : AMDGPUSubtargetFeature<"vgpr-index-mode",
"Has VGPR mode register indexing"
>;
-def FeatureScalarDwordx3Loads : SubtargetFeature<"scalar-dwordx3-loads",
- "HasScalarDwordx3Loads",
- "true",
+defm ScalarDwordx3Loads : AMDGPUSubtargetFeature<"scalar-dwordx3-loads",
"Has 96-bit scalar load instructions"
>;
-def FeatureScalarStores : SubtargetFeature<"scalar-stores",
- "HasScalarStores",
- "true",
+defm ScalarStores : AMDGPUSubtargetFeature<"scalar-stores",
"Has store scalar memory instructions"
>;
-def FeatureScalarAtomics : SubtargetFeature<"scalar-atomics",
- "HasScalarAtomics",
- "true",
+defm ScalarAtomics : AMDGPUSubtargetFeature<"scalar-atomics",
"Has atomic scalar memory instructions"
>;
-def FeatureSDWA : SubtargetFeature<"sdwa",
- "HasSDWA",
- "true",
- "Support SDWA (Sub-DWORD Addressing) extension"
+defm SDWA : AMDGPUSubtargetFeature<"sdwa",
+ "Support SDWA (Sub-DWORD Addressing) extension",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/0
>;
-def FeatureSDWAOmod : SubtargetFeature<"sdwa-omod",
- "HasSDWAOmod",
- "true",
- "Support OMod with SDWA (Sub-DWORD Addressing) extension"
+defm SDWAOmod : AMDGPUSubtargetFeature<"sdwa-omod",
+ "Support OMod with SDWA (Sub-DWORD Addressing) extension",
+ /*GenPredicate=*/0
>;
-def FeatureSDWAScalar : SubtargetFeature<"sdwa-scalar",
- "HasSDWAScalar",
- "true",
- "Support scalar register with SDWA (Sub-DWORD Addressing) extension"
+defm SDWAScalar : AMDGPUSubtargetFeature<"sdwa-scalar",
+ "Support scalar register with SDWA (Sub-DWORD Addressing) extension",
+ /*GenPredicate=*/0
>;
-def FeatureSDWASdst : SubtargetFeature<"sdwa-sdst",
- "HasSDWASdst",
- "true",
- "Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension"
+defm SDWASdst : AMDGPUSubtargetFeature<"sdwa-sdst",
+ "Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension",
+ /*GenPredicate=*/0
>;
-def FeatureSDWAMac : SubtargetFeature<"sdwa-mav",
- "HasSDWAMac",
- "true",
- "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension"
+defm SDWAMac : AMDGPUSubtargetFeature<"sdwa-mav",
+ "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension",
+ /*GenPredicate=*/0
>;
-def FeatureSDWAOutModsVOPC : SubtargetFeature<"sdwa-out-mods-vopc",
- "HasSDWAOutModsVOPC",
- "true",
- "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension"
+defm SDWAOutModsVOPC : AMDGPUSubtargetFeature<"sdwa-out-mods-vopc",
+ "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension",
+ /*GenPredicate=*/0
>;
def FeatureDPP : SubtargetFeature<"dpp",
@@ -689,270 +630,227 @@ def FeatureDPP8 : SubtargetFeature<"dpp8",
"Support DPP8 (Data Parallel Primitives) extension"
>;
-def FeatureDPALU_DPP : SubtargetFeature<"dpp-64bit",
- "HasDPALU_DPP",
- "true",
+defm DPALU_DPP : AMDGPUSubtargetFeature<"dpp-64bit",
"Support DPP (Data Parallel Primitives) extension in DP ALU"
>;
-def FeatureDPPSrc1SGPR : SubtargetFeature<"dpp-src1-sgpr",
- "HasDPPSrc1SGPR",
- "true",
- "Support SGPR for Src1 of DPP instructions"
+defm DPPSrc1SGPR : AMDGPUSubtargetFeature<"dpp-src1-sgpr",
+ "Support SGPR for Src1 of DPP instructions",
+ /*GenPredicate=*/0
>;
-def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops",
- "HasPackedFP32Ops",
- "true",
+defm PackedFP32Ops : AMDGPUSubtargetFeature<"packed-fp32-ops",
"Support packed fp32 instructions"
>;
-def FeatureR128A16 : SubtargetFeature<"r128-a16",
- "HasR128A16",
- "true",
- "Support gfx9-style A16 for 16-bit coordinates/gradients/lod/clamp/mip image operands, where a16 is aliased with r128"
+defm R128A16 : AMDGPUSubtargetFeature<"r128-a16",
+ "Support gfx9-style A16 for 16-bit coordinates/gradients/lod/clamp/mip image "
+ "operands, where a16 is aliased with r128"
>;
-def FeatureA16 : SubtargetFeature<"a16",
- "HasA16",
- "true",
+defm A16 : AMDGPUSubtargetFeature<"a16",
"Support A16 for 16-bit coordinates/gradients/lod/clamp/mip image operands"
>;
-def FeatureG16 : SubtargetFeature<"g16",
- "HasG16",
- "true",
+defm G16 : AMDGPUSubtargetFeature<"g16",
"Support G16 for 16-bit gradient image operands"
>;
-def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding",
- "HasNSAEncoding",
- "true",
- "Support NSA encoding for image instructions"
+defm NSAEncoding : AMDGPUSubtargetFeature<"nsa-encoding",
+ "Support NSA encoding for image instructions",
+ /*GenPredicate=*/0
>;
-def FeaturePartialNSAEncoding : SubtargetFeature<"partial-nsa-encoding",
- "HasPartialNSAEncoding",
- "true",
- "Support partial NSA encoding for image instructions"
+defm PartialNSAEncoding : AMDGPUSubtargetFeature<"partial-nsa-encoding",
+ "Support partial NSA encoding for image instructions",
+ /*GenPredicate=*/0
>;
-def FeatureImageInsts : SubtargetFeature<"image-insts",
- "HasImageInsts",
- "true",
+defm ImageInsts : AMDGPUSubtargetFeature<"image-insts",
"Support image instructions"
>;
-def FeatureExtendedImageInsts : SubtargetFeature<"extended-image-insts",
- "HasExtendedImageInsts",
- "true",
+defm ExtendedImageInsts : AMDGPUSubtargetFeature<"extended-image-insts",
"Support mips != 0, lod != 0, gather4, and get_lod"
>;
-def FeatureGFX10_AEncoding : SubtargetFeature<"gfx10_a-encoding",
- "GFX10_AEncoding",
- "true",
- "Has BVH ray tracing instructions"
+defm GFX10_AEncoding : AMDGPUSubtargetFeature<"gfx10_a-encoding",
+ "Has BVH ray tracing instructions",
+ /*GenPredicate=*/0
>;
-def FeatureGFX10_BEncoding : SubtargetFeature<"gfx10_b-encoding",
- "GFX10_BEncoding",
- "true",
- "Encoding format GFX10_B"
+defm GFX10_BEncoding : AMDGPUSubtargetFeature<"gfx10_b-encoding",
+ "Encoding format GFX10_B",
+ /*GenPredicate=*/0
>;
-def FeatureIntClamp : SubtargetFeature<"int-clamp-insts",
- "HasIntClamp",
- "true",
+defm IntClamp : AMDGPUSubtargetFeature<"int-clamp-insts",
"Support clamp for integer destination"
>;
-def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem",
- "HasUnpackedD16VMem",
- "true",
+defm UnpackedD16VMem : AMDGPUSubtargetFeature<"unpacked-d16-vmem",
"Has unpacked d16 vmem instructions"
>;
-def FeatureDLInsts : SubtargetFeature<"dl-insts",
- "HasDLInsts",
- "true",
+defm DLInsts : AMDGPUSubtargetFeature<"dl-insts",
"Has v_fmac_f32 and v_xnor_b32 instructions"
>;
-def FeatureFmacF64Inst : SubtargetFeature<"fmacf64-inst",
- "HasFmacF64Inst",
- "true",
+defm FmacF64Inst : AMDGPUSubtargetFeature<"fmacf64-inst",
"Has v_fmac_f64 instruction"
>;
-def FeatureDot1Insts : SubtargetFeature<"dot1-insts",
- "HasDot1Insts",
- "true",
+defm Dot1Insts : AMDGPUSubtargetFeature<"dot1-insts",
"Has v_dot4_i32_i8 and v_dot8_i32_i4 instructions"
>;
-def FeatureDot2Insts : SubtargetFeature<"dot2-insts",
- "HasDot2Insts",
- "true",
+defm Dot2Insts : AMDGPUSubtargetFeature<"dot2-insts",
"Has v_dot2_i32_i16, v_dot2_u32_u16 instructions"
>;
-def FeatureDot3Insts : SubtargetFeature<"dot3-insts",
- "HasDot3Insts",
- "true",
+defm Dot3Insts : AMDGPUSubtargetFeature<"dot3-insts",
"Has v_dot8c_i32_i4 instruction"
>;
-def FeatureDot4Insts : SubtargetFeature<"dot4-insts",
- "HasDot4Insts",
- "true",
+defm Dot4Insts : AMDGPUSubtargetFeature<"dot4-insts",
"Has v_dot2c_i32_i16 instruction"
>;
-def FeatureDot5Insts : SubtargetFeature<"dot5-insts",
- "HasDot5Insts",
- "true",
+defm Dot5Insts : AMDGPUSubtargetFeature<"dot5-insts",
"Has v_dot2c_f32_f16 instruction"
>;
-def FeatureDot6Insts : SubtargetFeature<"dot6-insts",
- "HasDot6Insts",
- "true",
+defm Dot6Insts : AMDGPUSubtargetFeature<"dot6-insts",
"Has v_dot4c_i32_i8 instruction"
>;
-def FeatureDot7Insts : SubtargetFeature<"dot7-insts",
- "HasDot7Insts",
- "true",
+defm Dot7Insts : AMDGPUSubtargetFeature<"dot7-insts",
"Has v_dot4_u32_u8, v_dot8_u32_u4 instructions"
>;
-def FeatureDot8Insts : SubtargetFeature<"dot8-insts",
- "HasDot8Insts",
- "true",
+defm Dot8Insts : AMDGPUSubtargetFeature<"dot8-insts",
"Has v_dot4_i32_iu8, v_dot8_i32_iu4 instructions"
>;
-def FeatureDot9Insts : SubtargetFeature<"dot9-insts",
- "HasDot9Insts",
- "true",
+defm Dot9Insts : AMDGPUSubtargetFeature<"dot9-insts",
"Has v_dot2_f16_f16, v_dot2_bf16_bf16 instructions"
>;
-def FeatureDot10Insts : SubtargetFeature<"dot10-insts",
- "HasDot10Insts",
- "true",
+defm Dot10Insts : AMDGPUSubtargetFeature<"dot10-insts",
"Has v_dot2_f32_f16 instruction"
>;
-def FeatureDot11Insts : SubtargetFeature<"dot11-insts",
- "HasDot11Insts",
- "true",
+defm Dot11Insts : AMDGPUSubtargetFeature<"dot11-insts",
"Has v_dot4_f32_fp8_fp8, v_dot4_f32_fp8_bf8, v_dot4_f32_bf8_fp8, v_dot4_f32_bf8_bf8 instructions"
>;
-def FeatureDot12Insts : SubtargetFeature<"dot12-insts",
- "HasDot12Insts",
- "true",
+defm Dot12Insts : AMDGPUSubtargetFeature<"dot12-insts",
"Has v_dot2_f32_bf16 instructions"
>;
-def FeatureDot13Insts : SubtargetFeature<"dot13-insts",
- "HasDot13Insts",
- "true",
+defm Dot13Insts : AMDGPUSubtargetFeature<"dot13-insts",
"Has v_dot2c_f32_bf16 instructions"
>;
-
-def FeatureMAIInsts : SubtargetFeature<"mai-insts",
- "HasMAIInsts",
- "true",
+defm MAIInsts : AMDGPUSubtargetFeature<"mai-insts",
"Has mAI instructions"
>;
-def FeatureFP8Insts : SubtargetFeature<"fp8-insts",
- "HasFP8Insts",
- "true",
+defm FP8Insts : AMDGPUSubtargetFeature<"fp8-insts",
"Has fp8 and bf8 instructions"
>;
-def FeatureFP8ConversionInsts : SubtargetFeature<"fp8-conversion-insts",
- "HasFP8ConversionInsts",
- "true",
+defm FP8ConversionInsts : AMDGPUSubtargetFeature<"fp8-conversion-insts",
"Has fp8 and bf8 conversion instructions"
>;
-def FeatureFP8E5M3Insts : SubtargetFeature<"fp8e5m3-insts",
- "HasFP8E5M3Insts",
- "true",
+defm FP8E5M3Insts : AMDGPUSubtargetFeature<"fp8e5m3-insts",
"Has fp8 e5m3 format support"
>;
-def FeatureCvtFP8VOP1Bug : SubtargetFeature<"cvt-fp8-vop1-bug",
- "HasCvtFP8Vop1Bug",
- "true",
+defm CvtFP8VOP1Bug : AMDGPUSubtargetFeature<"cvt-fp8-vop1-bug",
"FP8/BF8 VOP1 form of conversion to F32 is unreliable",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/0,
[FeatureFP8ConversionInsts]
>;
-def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst",
- "HasPkFmacF16Inst",
- "true",
+defm PkFmacF16Inst : AMDGPUSubtargetFeature<"pk-fmac-f16-inst",
"Has v_pk_fmac_f16 instruction"
>;
-def FeatureAtomicDsPkAdd16Insts : SubtargetFeature<"atomic-ds-pk-add-16-insts",
- "HasAtomicDsPkAdd16Insts",
- "true",
+defm CubeInsts : AMDGPUSubtargetFeature<"cube-insts",
+ "Has v_cube* instructions"
+>;
+
+defm LerpInst : AMDGPUSubtargetFeature<"lerp-inst",
+ "Has v_lerp_u8 instruction"
+>;
+
+defm SadInsts : AMDGPUSubtargetFeature<"sad-insts",
+ "Has v_sad* instructions"
+>;
+
+defm QsadInsts : AMDGPUSubtargetFeature<"qsad-insts",
+ "Has v_qsad* instructions"
+>;
+
+defm CvtNormInsts : AMDGPUSubtargetFeature<"cvt-norm-insts",
+ "Has v_cvt_norm* instructions"
+>;
+
+defm CvtPkNormVOP2Insts : AMDGPUSubtargetFeature<"cvt-pknorm-vop2-insts",
+ "Has v_cvt_pk_norm_*f32 instructions/Has v_cvt_pk_norm_*_f16 instructions"
+>;
+
+defm CvtPkNormVOP3Insts : AMDGPUSubtargetFeature<"cvt-pknorm-vop3-insts",
+ "Has v_cvt_pk_norm_*f32 instructions/Has v_cvt_pk_norm_*_f16 instructions"
+>;
+
+defm AtomicDsPkAdd16Insts : AMDGPUSubtargetFeature<"atomic-ds-pk-add-16-insts",
"Has ds_pk_add_bf16, ds_pk_add_f16, ds_pk_add_rtn_bf16, "
"ds_pk_add_rtn_f16 instructions"
>;
-def FeatureAtomicFlatPkAdd16Insts : SubtargetFeature<"atomic-flat-pk-add-16-insts",
- "HasAtomicFlatPkAdd16Insts",
- "true",
+defm AtomicFlatPkAdd16Insts : AMDGPUSubtargetFeature<"atomic-flat-pk-add-16-insts",
"Has flat_atomic_pk_add_f16 and flat_atomic_pk_add_bf16 instructions"
>;
-def FeatureAtomicFaddRtnInsts : SubtargetFeature<"atomic-fadd-rtn-insts",
- "HasAtomicFaddRtnInsts",
- "true",
+defm AtomicFaddRtnInsts : AMDGPUSubtargetFeature<"atomic-fadd-rtn-insts",
"Has buffer_atomic_add_f32 and global_atomic_add_f32 instructions that "
"return original value",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/1,
[FeatureFlatGlobalInsts]
>;
-def FeatureAtomicFMinFMaxF32GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-global-f32",
- "HasAtomicFMinFMaxF32GlobalInsts",
- "true",
+defm AtomicFMinFMaxF32GlobalInsts : AMDGPUSubtargetFeature<"atomic-fmin-fmax-global-f32",
"Has global/buffer instructions for atomicrmw fmin/fmax for float"
>;
-def FeatureAtomicFMinFMaxF64GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-global-f64",
- "HasAtomicFMinFMaxF64GlobalInsts",
- "true",
+defm AtomicFMinFMaxF64GlobalInsts : AMDGPUSubtargetFeature<"atomic-fmin-fmax-global-f64",
"Has global/buffer instructions for atomicrmw fmin/fmax for float"
>;
-def FeatureAtomicFMinFMaxF32FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f32",
- "HasAtomicFMinFMaxF32FlatInsts",
- "true",
+defm AtomicFMinFMaxF32FlatInsts : AMDGPUSubtargetFeature<"atomic-fmin-fmax-flat-f32",
"Has flat memory instructions for atomicrmw fmin/fmax for float",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/1,
[FeatureFlatAddressSpace]
>;
-def FeatureAtomicFMinFMaxF64FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f64",
- "HasAtomicFMinFMaxF64FlatInsts",
- "true",
+defm AtomicFMinFMaxF64FlatInsts : AMDGPUSubtargetFeature<"atomic-fmin-fmax-flat-f64",
"Has flat memory instructions for atomicrmw fmin/fmax for double",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/1,
[FeatureFlatAddressSpace]
>;
-def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts",
- "HasAtomicFaddNoRtnInsts",
- "true",
+defm AtomicFaddNoRtnInsts : AMDGPUSubtargetFeature<"atomic-fadd-no-rtn-insts",
"Has buffer_atomic_add_f32 and global_atomic_add_f32 instructions that "
"don't return original value",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/1,
[FeatureFlatGlobalInsts]
>;
@@ -965,46 +863,40 @@ def FeatureAtomicBufferGlobalPkAddF16NoRtnInsts
[FeatureFlatGlobalInsts]
>;
-def FeatureAtomicBufferGlobalPkAddF16Insts : SubtargetFeature<"atomic-buffer-global-pk-add-f16-insts",
- "HasAtomicBufferGlobalPkAddF16Insts",
- "true",
- "Has buffer_atomic_pk_add_f16 and global_atomic_pk_add_f16 instructions that "
- "can return original value",
- [FeatureFlatGlobalInsts]
+defm AtomicBufferGlobalPkAddF16Insts : AMDGPUSubtargetFeature<"atomic-buffer-global-pk-add-f16-insts",
+ "Has buffer_atomic_pk_add_f16 and global_atomic_pk_add_f16 instructions that "
+ "can return original value",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/1,
+ [FeatureFlatGlobalInsts]
>;
-def FeatureAtomicGlobalPkAddBF16Inst : SubtargetFeature<"atomic-global-pk-add-bf16-inst",
- "HasAtomicGlobalPkAddBF16Inst",
- "true",
- "Has global_atomic_pk_add_bf16 instruction",
- [FeatureFlatGlobalInsts]
+defm AtomicGlobalPkAddBF16Inst : AMDGPUSubtargetFeature<"atomic-global-pk-add-bf16-inst",
+ "Has global_atomic_pk_add_bf16 instruction",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/1,
+ [FeatureFlatGlobalInsts]
>;
-def FeatureAtomicBufferPkAddBF16Inst : SubtargetFeature<"atomic-buffer-pk-add-bf16-inst",
- "HasAtomicBufferPkAddBF16Inst",
- "true",
- "Has buffer_atomic_pk_add_bf16 instruction"
+defm AtomicBufferPkAddBF16Inst : AMDGPUSubtargetFeature<"atomic-buffer-pk-add-bf16-inst",
+ "Has buffer_atomic_pk_add_bf16 instruction"
>;
-def FeatureAtomicCSubNoRtnInsts : SubtargetFeature<"atomic-csub-no-rtn-insts",
- "HasAtomicCSubNoRtnInsts",
- "true",
+defm AtomicCSubNoRtnInsts : AMDGPUSubtargetFeature<"atomic-csub-no-rtn-insts",
"Has buffer_atomic_csub and global_atomic_csub instructions that don't "
- "return original value"
+ "return original value",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/0
>;
-def FeatureFlatAtomicFaddF32Inst
- : SubtargetFeature<"flat-atomic-fadd-f32-inst",
- "HasFlatAtomicFaddF32Inst",
- "true",
+defm FlatAtomicFaddF32Inst : AMDGPUSubtargetFeature<"flat-atomic-fadd-f32-inst",
"Has flat_atomic_add_f32 instruction",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/1,
[FeatureFlatAddressSpace]
>;
-def FeatureFlatBufferGlobalAtomicFaddF64Inst
- : SubtargetFeature<"flat-buffer-global-fadd-f64-inst",
- "HasFlatBufferGlobalAtomicFaddF64Inst",
- "true",
+defm FlatBufferGlobalAtomicFaddF64Inst : AMDGPUSubtargetFeature<"flat-buffer-global-fadd-f64-inst",
"Has flat, buffer, and global instructions for f64 atomic fadd"
>;
@@ -1015,33 +907,27 @@ def FeatureMemoryAtomicFAddF32DenormalSupport
"global/flat/buffer atomic fadd for float supports denormal handling"
>;
-def FeatureAgentScopeFineGrainedRemoteMemoryAtomics
- : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics",
- "HasAgentScopeFineGrainedRemoteMemoryAtomics",
- "true",
+defm AgentScopeFineGrainedRemoteMemoryAtomics : AMDGPUSubtargetFeature<
+ "agent-scope-fine-grained-remote-memory-atomics",
"Agent (device) scoped atomic operations, excluding those directly "
"supported by PCIe (i.e. integer atomic add, exchange, and "
"compare-and-swap), are functional for allocations in host or peer "
- "device memory."
+ "device memory.",
+ /*GenPredicate=*/0
>;
-def FeatureEmulatedSystemScopeAtomics
- : SubtargetFeature<"emulated-system-scope-atomics",
- "HasEmulatedSystemScopeAtomics",
- "true",
+defm EmulatedSystemScopeAtomics : AMDGPUSubtargetFeature<
+ "emulated-system-scope-atomics",
"System scope atomics unsupported by the PCI-e are emulated in HW via CAS "
- "loop and functional."
+ "loop and functional.",
+ /*GenPredicate=*/0
>;
-def FeatureDefaultComponentZero : SubtargetFeature<"default-component-zero",
- "HasDefaultComponentZero",
- "true",
+defm DefaultComponentZero : AMDGPUSubtargetFeature<"default-component-zero",
"BUFFER/IMAGE store instructions set unspecified components to zero (before GFX12)"
>;
-def FeatureDefaultComponentBroadcast : SubtargetFeature<"default-component-broadcast",
- "HasDefaultComponentBroadcast",
- "true",
+defm DefaultComponentBroadcast : AMDGPUSubtargetFeature<"default-component-broadcast",
"BUFFER/IMAGE store instructions set unspecified components to x component (GFX12)"
>;
@@ -1057,183 +943,144 @@ def FeatureSRAMECC : SubtargetFeature<"sramecc",
"Enable SRAMECC"
>;
-def FeatureNoSdstCMPX : SubtargetFeature<"no-sdst-cmpx",
- "HasNoSdstCMPX",
- "true",
+defm NoSdstCMPX : AMDGPUSubtargetFeature<"no-sdst-cmpx",
"V_CMPX does not write VCC/SGPR in addition to EXEC"
>;
-def FeatureVscnt : SubtargetFeature<"vscnt",
- "HasVscnt",
- "true",
- "Has separate store vscnt counter"
+defm Vscnt : AMDGPUSubtargetFeature<"vscnt",
+ "Has separate store vscnt counter",
+ /*GenPredicate=*/0
>;
-def FeatureGetWaveIdInst : SubtargetFeature<"get-wave-id-inst",
- "HasGetWaveIdInst",
- "true",
+defm GetWaveIdInst : AMDGPUSubtargetFeature<"get-wave-id-inst",
"Has s_get_waveid_in_workgroup instruction"
>;
-def FeatureSMemTimeInst : SubtargetFeature<"s-memtime-inst",
- "HasSMemTimeInst",
- "true",
+defm SMemTimeInst : AMDGPUSubtargetFeature<"s-memtime-inst",
"Has s_memtime instruction"
>;
-def FeatureShaderCyclesRegister : SubtargetFeature<"shader-cycles-register",
- "HasShaderCyclesRegister",
- "true",
+defm ShaderCyclesRegister : AMDGPUSubtargetFeature<"shader-cycles-register",
"Has SHADER_CYCLES hardware register"
>;
-def FeatureShaderCyclesHiLoRegisters : SubtargetFeature<"shader-cycles-hi-lo-registers",
- "HasShaderCyclesHiLoRegisters",
- "true",
- "Has SHADER_CYCLES_HI/LO hardware registers"
+defm ShaderCyclesHiLoRegisters : AMDGPUSubtargetFeature<"shader-cycles-hi-lo-registers",
+ "Has SHADER_CYCLES_HI/LO hardware registers",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/0
>;
-def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts",
- "HasMadMacF32Insts",
- "true",
+defm MadMacF32Insts : AMDGPUSubtargetFeature<"mad-mac-f32-insts",
"Has v_mad_f32/v_mac_f32/v_madak_f32/v_madmk_f32 instructions"
>;
-def FeatureDsSrc2Insts : SubtargetFeature<"ds-src2-insts",
- "HasDsSrc2Insts",
- "true",
+defm DsSrc2Insts : AMDGPUSubtargetFeature<"ds-src2-insts",
"Has ds_*_src2 instructions"
>;
-def FeatureVOP3Literal : SubtargetFeature<"vop3-literal",
- "HasVOP3Literal",
- "true",
- "Can use one literal in VOP3"
+defm VOP3Literal : AMDGPUSubtargetFeature<"vop3-literal",
+ "Can use one literal in VOP3",
+ /*GenPredicate=*/0
>;
-def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard",
- "HasNoDataDepHazard",
- "true",
- "Does not need SW waitstates"
+defm NoDataDepHazard : AMDGPUSubtargetFeature<"no-data-dep-hazard",
+ "Does not need SW waitstates",
+ /*GenPredicate=*/0
>;
// Allocate 1536 VGPRs for wave32 and 768 VGPRs for wave64
// with allocation granularity 24 for wave32 and 12 for wave64
-def Feature1_5xVGPRs : SubtargetFeature<"allocate1_5xvgprs",
- "Has1_5xVGPRs",
- "true",
- "Has 50% more physical VGPRs and 50% larger allocation granule"
+defm 1_5xVGPRs : AMDGPUSubtargetFeature<"allocate1_5xvgprs",
+ "Has 50% more physical VGPRs and 50% larger allocation granule",
+ /*GenPredicate=*/0
>;
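// Spelling out the arithmetic above (the 1024/512 and 16/8 baselines are
// simply the stated figures divided by 1.5):
//   wave32: 1024 x 1.5 = 1536 physical VGPRs, allocation granule 16 x 1.5 = 24
//   wave64:  512 x 1.5 =  768 physical VGPRs, allocation granule  8 x 1.5 = 12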
-
-def FeatureVOPD : SubtargetFeature<"vopd",
- "HasVOPDInsts",
- "true",
- "Has VOPD dual issue wave32 instructions"
+defm VOPDInsts : AMDGPUSubtargetFeature<"vopd",
+ "Has VOPD dual issue wave32 instructions",
+ /*GenPredicate=*/0
>;
-def FeatureVALUTransUseHazard : SubtargetFeature<"valu-trans-use-hazard",
- "HasVALUTransUseHazard",
- "true",
- "Hazard when TRANS instructions are closely followed by a use of the result"
+defm VALUTransUseHazard : AMDGPUSubtargetFeature<"valu-trans-use-hazard",
+ "Hazard when TRANS instructions are closely followed by a use of the result",
+ /*GenPredicate=*/0
>;
-def FeatureSALUFloatInsts : SubtargetFeature<"salu-float",
- "HasSALUFloatInsts",
- "true",
+defm SALUFloatInsts : AMDGPUSubtargetFeature<"salu-float",
"Has SALU floating point instructions"
>;
-def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans",
- "HasPseudoScalarTrans",
- "true",
+defm PseudoScalarTrans : AMDGPUSubtargetFeature<"pseudo-scalar-trans",
"Has Pseudo Scalar Transcendental instructions"
>;
-def FeatureHasRestrictedSOffset : SubtargetFeature<"restricted-soffset",
- "HasRestrictedSOffset",
- "true",
+defm RestrictedSOffset : AMDGPUSubtargetFeature<"restricted-soffset",
"Has restricted SOffset (immediate not supported)."
>;
-def FeatureRequiredExportPriority : SubtargetFeature<"required-export-priority",
- "HasRequiredExportPriority",
- "true",
- "Export priority must be explicitly manipulated on GFX11.5"
+defm RequiredExportPriority : AMDGPUSubtargetFeature<"required-export-priority",
+ "Export priority must be explicitly manipulated on GFX11.5",
+ /*GenPredicate=*/0
>;
-def FeatureVmemWriteVgprInOrder : SubtargetFeature<"vmem-write-vgpr-in-order",
- "HasVmemWriteVgprInOrder",
- "true",
- "VMEM instructions of the same type write VGPR results in order"
+defm VmemWriteVgprInOrder : AMDGPUSubtargetFeature<"vmem-write-vgpr-in-order",
+ "VMEM instructions of the same type write VGPR results in order",
+ /*GenPredicate=*/0
>;
-def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts",
- "HasBitOp3Insts",
- "true",
+defm BitOp3Insts : AMDGPUSubtargetFeature<"bitop3-insts",
"Has v_bitop3_b32/v_bitop3_b16 instructions"
>;
-def FeatureTanhInsts : SubtargetFeature<"tanh-insts",
- "HasTanhInsts",
- "true",
+defm TanhInsts : AMDGPUSubtargetFeature<"tanh-insts",
"Has v_tanh_f32/f16 instructions"
>;
-def FeatureTensorCvtLutInsts : SubtargetFeature<"tensor-cvt-lut-insts",
- "HasTensorCvtLutInsts",
- "true",
+defm TensorCvtLutInsts : AMDGPUSubtargetFeature<"tensor-cvt-lut-insts",
"Has v_perm_pk16* instructions"
>;
-def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts",
- "HasTransposeLoadF4F6Insts",
- "true",
+defm TransposeLoadF4F6Insts : AMDGPUSubtargetFeature<"transpose-load-f4f6-insts",
"Has ds_load_tr4/tr6 and global_load_tr4/tr6 instructions"
>;
-def FeaturePrngInst : SubtargetFeature<"prng-inst",
- "HasPrngInst",
- "true",
+defm PrngInst : AMDGPUSubtargetFeature<"prng-inst",
"Has v_prng_b32 instruction"
>;
-def FeatureBVHDualAndBVH8Insts : SubtargetFeature<"bvh-dual-bvh-8-insts",
- "HasBVHDualAndBVH8Insts",
- "true",
+defm BVHDualAndBVH8Insts : AMDGPUSubtargetFeature<"bvh-dual-bvh-8-insts",
"Has image_bvh_dual_intersect_ray and image_bvh8_intersect_ray instructions"
>;
-def FeaturePointSampleAccel : SubtargetFeature<"point-sample-accel",
- "HasPointSampleAccel",
- "true",
- "Has point sample acceleration feature"
+defm PointSampleAccel : AMDGPUSubtargetFeature<"point-sample-accel",
+ "Has point sample acceleration feature",
+ /*GenPredicate=*/0
>;
-def Feature64BitLiterals : SubtargetFeature<"64-bit-literals",
- "Has64BitLiterals",
- "true",
+defm 64BitLiterals : AMDGPUSubtargetFeature<"64-bit-literals",
"Can use 64-bit literals with single DWORD instructions"
>;
-def Feature1024AddressableVGPRs : SubtargetFeature<"1024-addressable-vgprs",
- "Has1024AddressableVGPRs",
- "true",
+defm 1024AddressableVGPRs : AMDGPUSubtargetFeature<"1024-addressable-vgprs",
"Has 1024 addressable VGPRs"
>;
-def FeatureWaitXcnt : SubtargetFeature<"wait-xcnt",
- "HasWaitXcnt",
- "true",
+defm SetregVGPRMSBFixup : AMDGPUSubtargetFeature<"setreg-vgpr-msb-fixup",
+ "S_SETREG to MODE clobbers VGPR MSB bits, requires fixup",
+ /*GenPredicate=*/0
+>;
+
+defm WaitXcnt : AMDGPUSubtargetFeature<"wait-xcnt",
"Has s_wait_xcnt instruction"
>;
-def FeatureSetPrioIncWgInst : SubtargetFeature<"setprio-inc-wg-inst",
- "HasSetPrioIncWgInst",
- "true",
+defm SetPrioIncWgInst : AMDGPUSubtargetFeature<"setprio-inc-wg-inst",
"Has s_setprio_inc_wg instruction."
>;
+defm SWakeupBarrier : AMDGPUSubtargetFeature<"s-wakeup-barrier-inst",
+ "Has s_wakeup_barrier instruction."
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -1244,11 +1091,9 @@ def FeatureSetPrioIncWgInst : SubtargetFeature<"setprio-inc-wg-inst",
// wave32 and wave64. Instead what users do is assemble with both
// wavesizes enabled. We translate this into this special mode so that it
// only influences assembler behavior and nothing else.
-def FeatureAssemblerPermissiveWavesize : SubtargetFeature<
- "assembler-permissive-wavesize",
- "AssemblerPermissiveWavesize",
- "true",
- "allow parsing wave32 and wave64 variants of instructions"
+defm AssemblerPermissiveWavesize : AMDGPUSubtargetFeature<"assembler-permissive-wavesize",
+ "Allow parsing wave32 and wave64 variants of instructions",
+ /*GenPredicate=*/0
>;
class FeatureMaxPrivateElementSize<int size> : SubtargetFeature<
@@ -1262,12 +1107,6 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
-def FeatureDumpCode : SubtargetFeature <"DumpCode",
- "DumpCode",
- "true",
- "Dump MachineInstrs in the CodeEmitter"
->;
-
def FeatureDumpCodeLower : SubtargetFeature <"dumpcode",
"DumpCode",
"true",
@@ -1321,74 +1160,64 @@ def FeatureEnablePRTStrictNull : SubtargetFeature<"enable-prt-strict-null",
// FIXME: moveToVALU should be able to handle converting addr64 MUBUF
// instructions.
-def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
- "FlatForGlobal",
+def FeatureUseFlatForGlobal : SubtargetFeature<"flat-for-global",
+ "UseFlatForGlobal",
"true",
"Force to generate flat instruction for global"
>;
-def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature <
- "auto-waitcnt-before-barrier",
- "AutoWaitcntBeforeBarrier",
- "true",
- "Hardware automatically inserts waitcnt before barrier"
+defm AutoWaitcntBeforeBarrier : AMDGPUSubtargetFeature <"auto-waitcnt-before-barrier",
+ "Hardware automatically inserts waitcnt before barrier",
+ /*GenPredicate=*/0
>;
-def FeatureBackOffBarrier : SubtargetFeature <"back-off-barrier",
- "BackOffBarrier",
- "true",
- "Hardware supports backing off s_barrier if an exception occurs"
+defm BackOffBarrier : AMDGPUSubtargetFeature <"back-off-barrier",
+ "Hardware supports backing off s_barrier if an exception occurs",
+ /*GenPredicate=*/0
>;
-def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range",
- "HasTrigReducedRange",
- "true",
- "Requires use of fract on arguments to trig instructions"
+defm TrigReducedRange : AMDGPUSubtargetFeature<"trig-reduced-range",
+ "Requires use of fract on arguments to trig instructions",
+ /*GenPredicate=*/0
>;
-def FeatureKernargPreload : SubtargetFeature <"kernarg-preload",
- "KernargPreload",
- "true",
- "Hardware supports preloading of kernel arguments in user SGPRs."
+defm KernargPreload : AMDGPUSubtargetFeature <"kernarg-preload",
+ "Hardware supports preloading of kernel arguments in user SGPRs.",
+ /*GenPredicate=*/0
>;
// Alignment enforcement is controlled by a configuration register:
// SH_MEM_CONFIG.alignment_mode
-def FeatureUnalignedAccessMode : SubtargetFeature<"unaligned-access-mode",
- "UnalignedAccessMode",
- "true",
+defm UnalignedAccessMode : AMDGPUSubtargetFeature<"unaligned-access-mode",
"Enable unaligned global, local and region loads and stores if the hardware"
" supports it"
>;
-def FeaturePackedTID : SubtargetFeature<"packed-tid",
- "HasPackedTID",
- "true",
- "Workitem IDs are packed into v0 at kernel launch"
+defm PackedTID : AMDGPUSubtargetFeature<"packed-tid",
+ "Workitem IDs are packed into v0 at kernel launch",
+ /*GenPredicate=*/0
>;
-def FeatureArchitectedFlatScratch : SubtargetFeature<"architected-flat-scratch",
- "HasArchitectedFlatScratch",
- "true",
- "Flat Scratch register is a readonly SPI initialized architected register"
+defm ArchitectedFlatScratch : AMDGPUSubtargetFeature<"architected-flat-scratch",
+ "Flat Scratch register is a readonly SPI initialized architected register",
+ /*GenPredicate=*/0
>;
-def FeatureArchitectedSGPRs : SubtargetFeature<"architected-sgprs",
- "HasArchitectedSGPRs",
- "true",
- "Enable the architected SGPRs"
+defm ArchitectedSGPRs : AMDGPUSubtargetFeature<"architected-sgprs",
+ "Enable the architected SGPRs",
+ /*GenPredicate=*/0
>;
-def FeatureGDS : SubtargetFeature<"gds",
- "HasGDS",
- "true",
- "Has Global Data Share"
+defm GDS : AMDGPUSubtargetFeature<"gds",
+ "Has Global Data Share",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/0
>;
-def FeatureGWS : SubtargetFeature<"gws",
- "HasGWS",
- "true",
- "Has Global Wave Sync"
+defm GWS : AMDGPUSubtargetFeature<"gws",
+ "Has Global Wave Sync",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/0
>;
def FeatureRequiresCOV6 : SubtargetFeature<"requires-cov6",
@@ -1397,18 +1226,14 @@ def FeatureRequiresCOV6 : SubtargetFeature<"requires-cov6",
"Target Requires Code Object V6"
>;
-def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
- "HasXF32Insts",
- "true",
- "Has instructions that support xf32 format, such as "
- "v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
- >;
+defm XF32Insts : AMDGPUSubtargetFeature<"xf32-insts",
+ "Has instructions that support xf32 format, such as "
+ "v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
+>;
-def FeatureGloballyAddressableScratch : SubtargetFeature<
- "globally-addressable-scratch",
- "HasGloballyAddressableScratch",
- "true",
- "FLAT instructions can access scratch memory for any thread in any wave"
+defm GloballyAddressableScratch : AMDGPUSubtargetFeature<"globally-addressable-scratch",
+ "FLAT instructions can access scratch memory for any thread in any wave",
+ /*GenPredicate=*/0
>;
// Enable the use of SCRATCH_STORE/LOAD_BLOCK instructions for saving and
@@ -1419,45 +1244,56 @@ def FeatureUseBlockVGPROpsForCSR : SubtargetFeature<"block-vgpr-csr",
"Use block load/store for VGPR callee saved registers"
>;
-def FeatureLshlAddU64Inst
- : SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true",
- "Has v_lshl_add_u64 instruction">;
+defm LshlAddU64Inst : AMDGPUSubtargetFeature<"lshl-add-u64-inst",
+ "Has v_lshl_add_u64 instruction"
+>;
-def FeatureAddSubU64Insts
- : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true",
- "Has v_add_u64 and v_sub_u64 instructions">;
+defm AddSubU64Insts : AMDGPUSubtargetFeature<"add-sub-u64-insts",
+ "Has v_add_u64 and v_sub_u64 instructions"
+>;
-def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst",
- "true", "Has v_mad_u32 instruction">;
+defm MadU32Inst : AMDGPUSubtargetFeature<"mad-u32-inst",
+ "Has v_mad_u32 instruction"
+>;
-def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
- "HasVMemToLDSLoad",
- "true",
- "The platform has memory to lds instructions (global_load w/lds bit set, buffer_load w/lds bit set or global_load_lds. This does not include scratch_load_lds."
+defm AddMinMaxInsts : AMDGPUSubtargetFeature<"add-min-max-insts",
+ "Has v_add_{min|max}_{i|u}32 instructions"
>;
-def FeatureLdsBarrierArriveAtomic : SubtargetFeature< "lds-barrier-arrive-atomic",
- "HasLdsBarrierArriveAtomic",
- "true",
+defm PkAddMinMaxInsts : AMDGPUSubtargetFeature<"pk-add-min-max-insts",
+ "Has v_pk_add_{min|max}_{i|u}16 instructions"
+>;
+
+defm VMemToLDSLoad : AMDGPUSubtargetFeature<"vmem-to-lds-load-insts",
+ "The platform has memory to lds instructions (global_load w/lds bit set, buffer_load"
+ "w/lds bit set or global_load_lds. This does not include scratch_load_lds."
+>;
+
+defm LdsBarrierArriveAtomic : AMDGPUSubtargetFeature<"lds-barrier-arrive-atomic",
"Has LDS barrier-arrive atomic instructions"
>;
-def Feature45BitNumRecordsBufferResource : SubtargetFeature< "45-bit-num-records-buffer-resource",
- "Has45BitNumRecordsBufferResource",
- "true",
- "The buffer resource (V#) supports 45-bit num_records"
+defm 45BitNumRecordsBufferResource : AMDGPUSubtargetFeature<"45-bit-num-records-buffer-resource",
+ "The buffer resource (V#) supports 45-bit num_records",
+ /*GenPredicate=*/0
+>;
+
+defm Clusters : AMDGPUSubtargetFeature<"clusters",
+ "Has clusters of workgroups support",
+ /*GenPredicate=*/0
>;
-def FeatureClusters : SubtargetFeature< "clusters",
- "HasClusters",
+def FeatureWaitsBeforeSystemScopeStores : SubtargetFeature<
+ "waits-before-system-scope-stores",
+ "RequiresWaitsBeforeSystemScopeStores",
"true",
- "Has clusters of workgroups support"
+ "Target requires waits for loads and atomics before system scope stores"
>;
-// Dummy feature used to disable assembler instructions.
-def FeatureDisable : SubtargetFeature<"",
- "FeatureDisable","true",
- "Dummy feature to disable assembler instructions"
+def FeatureUseAddPC64Inst : SubtargetFeature<"use-add-pc64-inst",
+ "UseAddPC64Inst",
+ "true",
+ "Use s_add_pc_i64 instruction."
>;
//===----------------------------------------------------------------------===//
@@ -1475,7 +1311,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureVmemWriteVgprInOrder
+ FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst,
+ FeatureSadInsts, FeatureCvtPkNormVOP2Insts
]
>;
@@ -1489,7 +1326,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
- FeatureVmemWriteVgprInOrder
+ FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst,
+ FeatureSadInsts, FeatureQsadInsts, FeatureCvtPkNormVOP2Insts
]
>;
@@ -1505,7 +1343,9 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
- FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder
+ FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder, FeatureCubeInsts,
+ FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts,
+ FeatureCvtPkNormVOP2Insts
]
>;
@@ -1515,7 +1355,7 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureWavefrontSize64, FeatureFlatAddressSpace,
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
- FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
+ FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3PInsts, FeatureVGPRIndexMode,
FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
@@ -1524,7 +1364,10 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS,
- FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad
+    FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder, FeatureVMemToLDSLoad,
+ FeatureCubeInsts, FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts,
+ FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts,
+ FeatureCvtPkNormVOP3Insts
]
>;
@@ -1534,7 +1377,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureFlatAddressSpace,
FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureInv2PiInlineImm,
- FeatureApertureRegs, FeatureGFX9Insts, FeatureGFX10Insts, FeatureVOP3P,
+ FeatureApertureRegs, FeatureGFX9Insts, FeatureGFX10Insts, FeatureVOP3PInsts,
FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
@@ -1548,7 +1391,10 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureDefaultComponentZero, FeatureMaxHardClauseLength63,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
- FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad
+ FeatureVmemWriteVgprInOrder, FeatureVMemToLDSLoad, FeatureCubeInsts,
+ FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts,
+ FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts,
+ FeatureCvtPkNormVOP3Insts
]
>;
@@ -1559,7 +1405,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
FeatureInv2PiInlineImm, FeatureApertureRegs,
FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts,
FeatureGFX10_AEncoding, FeatureGFX10_BEncoding, FeatureGFX10_3Insts,
- FeatureGFX11Insts, FeatureVOP3P, FeatureVOPD, FeatureTrue16BitInsts,
+ FeatureGFX11Insts, FeatureVOP3PInsts, FeatureVOPDInsts, FeatureTrue16BitInsts,
FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
FeatureAddNoCarryInsts, FeatureFmaMixInsts,
@@ -1571,7 +1417,9 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS,
FeatureDefaultComponentZero, FeatureMaxHardClauseLength32,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
- FeatureVmemWriteVgprInOrder
+ FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst,
+ FeatureSadInsts, FeatureQsadInsts, FeatureCvtNormInsts,
+ FeatureCvtPkNormVOP2Insts, FeatureCvtPkNormVOP3Insts
]
>;
@@ -1582,7 +1430,7 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
FeatureInv2PiInlineImm, FeatureApertureRegs,
FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts,
FeatureGFX10_AEncoding, FeatureGFX10_BEncoding, FeatureGFX10_3Insts,
- FeatureGFX11Insts, FeatureGFX12Insts, FeatureVOP3P, FeatureVOPD,
+ FeatureGFX11Insts, FeatureGFX12Insts, FeatureVOP3PInsts, FeatureVOPDInsts,
FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
FeatureAddNoCarryInsts, FeatureFmaMixInsts,
@@ -1599,6 +1447,29 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
]
>;
+def FeatureGFX13 : GCNSubtargetFeatureGeneration<"GFX13",
+ "gfx13",
+ [FeatureFP64, FeatureMIMG_R128,
+ FeatureFlatAddressSpace, Feature16BitInsts,
+ FeatureInv2PiInlineImm, FeatureApertureRegs,
+ FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts,
+ FeatureGFX10_AEncoding, FeatureGFX10_BEncoding, FeatureGFX10_3Insts,
+ FeatureGFX11Insts, FeatureGFX12Insts, FeatureGFX13Insts, FeatureVOP3PInsts,
+ FeatureVOPDInsts, FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
+ FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
+ FeatureAddNoCarryInsts, FeatureFmaMixInsts,
+ FeatureNoSdstCMPX, FeatureVscnt,
+ FeatureVOP3Literal, FeatureDPP8,
+ FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
+ FeatureA16, FeatureFastDenormalF32, FeatureG16,
+ FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, FeatureImageInsts,
+ FeatureUnalignedDSAccess, FeatureTrue16BitInsts,
+ FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32,
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
+ FeatureIEEEMinimumMaximumInsts, FeatureMinimum3Maximum3F32,
+ FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics
+ ]
+>;
//===----------------------------------------------------------------------===//
class FeatureSet<list<SubtargetFeature> Features_> {
@@ -1607,7 +1478,7 @@ class FeatureSet<list<SubtargetFeature> Features_> {
def FeatureISAVersion6_0_0 : FeatureSet<[FeatureSouthernIslands,
FeatureFastFMAF32,
- HalfRate64Ops,
+ FeatureHalfRate64Ops,
FeatureLDSBankCount32]>;
def FeatureISAVersion6_0_1 : FeatureSet<
@@ -1624,7 +1495,7 @@ def FeatureISAVersion7_0_0 : FeatureSet<
def FeatureISAVersion7_0_1 : FeatureSet<
[FeatureSeaIslands,
- HalfRate64Ops,
+ FeatureHalfRate64Ops,
FeatureLDSBankCount32,
FeatureFastFMAF32]>;
@@ -1653,7 +1524,7 @@ def FeatureISAVersion8_0_Common : FeatureSet<
def FeatureISAVersion8_0_1 : FeatureSet<
!listconcat(FeatureISAVersion8_0_Common.Features,
[FeatureFastFMAF32,
- HalfRate64Ops,
+ FeatureHalfRate64Ops,
FeatureSupportsXNACK])>;
def FeatureISAVersion8_0_2 : FeatureSet<
@@ -1724,7 +1595,7 @@ def FeatureISAVersion9_0_4 : FeatureSet<
def FeatureISAVersion9_0_6 : FeatureSet<
!listconcat(FeatureISAVersion9_0_Consumer_Common.Features,
- [HalfRate64Ops,
+ [FeatureHalfRate64Ops,
FeatureFmaMixInsts,
FeatureDLInsts,
FeatureDot1Insts,
@@ -1736,7 +1607,7 @@ def FeatureISAVersion9_0_6 : FeatureSet<
def FeatureISAVersion9_0_8 : FeatureSet<
!listconcat(FeatureISAVersion9_0_MI_Common.Features,
[FeatureGDS,
- HalfRate64Ops,
+ FeatureHalfRate64Ops,
FeatureDsSrc2Insts,
FeatureExtendedImageInsts,
FeatureAtomicBufferGlobalPkAddF16NoRtnInsts,
@@ -1757,7 +1628,7 @@ def FeatureISAVersion9_0_A : FeatureSet<
FeatureAtomicFaddRtnInsts,
FeatureAtomicBufferGlobalPkAddF16Insts,
FeaturePackedTID,
- FullRate64Ops,
+ FeatureFullRate64Ops,
FeatureBackOffBarrier,
FeatureKernargPreload,
FeatureAtomicFMinFMaxF64GlobalInsts,
@@ -1800,7 +1671,7 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureSupportsSRAMECC,
FeaturePackedTID,
FeatureArchitectedFlatScratch,
- FullRate64Ops,
+ FeatureFullRate64Ops,
FeatureBackOffBarrier,
FeatureKernargPreload,
FeatureAtomicFMinFMaxF64GlobalInsts,
@@ -1861,7 +1732,7 @@ def FeatureISAVersion10_1_Common : FeatureSet<
FeatureGetWaveIdInst,
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
- FeatureLdsMisalignedBug,
+ FeatureLDSMisalignedBug,
FeatureSupportsXNACK,
// gfx101x bugs
FeatureVcmpxPermlaneHazard,
@@ -2009,6 +1880,13 @@ def FeatureISAVersion11_5_3 : FeatureSet<
!listconcat(FeatureISAVersion11_5_Common.Features,
[])>;
+def FeatureISAVersion11_7_0 : FeatureSet<
+ !listconcat(FeatureISAVersion11_Common.Features,
+ [FeatureSALUFloatInsts,
+ FeatureDPPSrc1SGPR,
+ FeatureFP8ConversionInsts,
+ FeatureDot11Insts])>;
+
def FeatureISAVersion12 : FeatureSet<
[FeatureGFX12,
FeatureBackOffBarrier,
@@ -2042,20 +1920,28 @@ def FeatureISAVersion12 : FeatureSet<
FeatureVcmpxPermlaneHazard,
FeatureSALUFloatInsts,
FeaturePseudoScalarTrans,
- FeatureHasRestrictedSOffset,
+ FeatureRestrictedSOffset,
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureMaxHardClauseLength32,
Feature1_5xVGPRs,
FeatureMemoryAtomicFAddF32DenormalSupport,
- FeatureBVHDualAndBVH8Insts
+ FeatureBVHDualAndBVH8Insts,
+ FeatureWaitsBeforeSystemScopeStores,
+ FeatureD16Writes32BitVgpr,
+ FeatureCubeInsts,
+ FeatureLerpInst,
+ FeatureSadInsts,
+ FeatureQsadInsts,
+ FeatureCvtNormInsts,
+ FeatureCvtPkNormVOP2Insts,
+ FeatureCvtPkNormVOP3Insts
]>;
-def FeatureISAVersion12_50 : FeatureSet<
+def FeatureISAVersion12_50_Common : FeatureSet<
[FeatureGFX12,
FeatureGFX1250Insts,
FeatureRequiresAlignedVGPRs,
- FeatureAddressableLocalMemorySize327680,
FeatureCuMode,
Feature1024AddressableVGPRs,
Feature64BitLiterals,
@@ -2084,7 +1970,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureVcmpxPermlaneHazard,
FeatureSALUFloatInsts,
FeaturePseudoScalarTrans,
- FeatureHasRestrictedSOffset,
+ FeatureRestrictedSOffset,
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureBitOp3Insts,
@@ -2115,22 +2001,107 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureLshlAddU64Inst,
FeatureAddSubU64Insts,
FeatureMadU32Inst,
+ FeatureAddMinMaxInsts,
+ FeaturePkAddMinMaxInsts,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
+ FeatureSWakeupBarrier,
Feature45BitNumRecordsBufferResource,
FeatureSupportsXNACK,
FeatureXNACK,
FeatureClusters,
+ FeatureD16Writes32BitVgpr,
+ FeatureMcastLoadInsts
]>;
+def FeatureISAVersion12_50 : FeatureSet<
+ !listconcat(FeatureISAVersion12_50_Common.Features,
+ [FeatureAddressableLocalMemorySize327680,
+ FeatureSetregVGPRMSBFixup,
+ FeatureCubeInsts,
+ FeatureLerpInst,
+ FeatureSadInsts,
+ FeatureQsadInsts,
+ FeatureCvtNormInsts,
+ FeatureCvtPkNormVOP2Insts,
+ FeatureCvtPkNormVOP3Insts])>;
+
def FeatureISAVersion12_51 : FeatureSet<
- !listconcat(FeatureISAVersion12_50.Features,
- [FeatureDPALU_DPP])>;
+ !listconcat(FeatureISAVersion12_50_Common.Features,
+ [FeatureAddressableLocalMemorySize327680,
+ FeatureDPALU_DPP,
+ FeatureCubeInsts,
+ FeatureLerpInst,
+ FeatureSadInsts,
+ FeatureQsadInsts,
+ FeatureCvtNormInsts,
+ FeatureCvtPkNormVOP2Insts,
+ FeatureCvtPkNormVOP3Insts])>;
def FeatureISAVersion12_Generic: FeatureSet<
!listconcat(FeatureISAVersion12.Features,
[FeatureRequiresCOV6])>;
+def FeatureISAVersion13 : FeatureSet<
+ [FeatureGFX13,
+ FeatureGFX1250Insts,
+ FeatureAddressableLocalMemorySize65536,
+ Feature64BitLiterals,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureFmacF64Inst,
+ FeatureDot7Insts,
+ FeatureDot8Insts,
+ FeatureNSAEncoding,
+ FeaturePartialNSAEncoding,
+ FeatureShaderCyclesRegister,
+ FeatureArchitectedFlatScratch,
+ FeatureArchitectedSGPRs,
+ FeatureAtomicFaddRtnInsts,
+ FeatureAtomicFaddNoRtnInsts,
+ FeatureAtomicDsPkAdd16Insts,
+ FeatureAtomicFlatPkAdd16Insts,
+ FeatureAtomicBufferGlobalPkAddF16Insts,
+ FeatureAtomicGlobalPkAddBF16Inst,
+ FeatureAtomicBufferPkAddBF16Inst,
+ FeatureFlatAtomicFaddF32Inst,
+ FeatureFP8ConversionInsts,
+ FeaturePackedTID,
+ FeatureVcmpxPermlaneHazard,
+ FeatureSALUFloatInsts,
+ FeaturePseudoScalarTrans,
+ FeatureRestrictedSOffset,
+ FeatureScalarDwordx3Loads,
+ FeatureDPPSrc1SGPR,
+ FeatureBitOp3Insts,
+ FeatureTanhInsts,
+ FeatureTensorCvtLutInsts,
+ FeatureTransposeLoadF4F6Insts,
+ Feature1_5xVGPRs,
+ FeatureBF16TransInsts,
+ FeatureBF16ConversionInsts,
+ FeatureBF16PackedInsts,
+ FeaturePrngInst,
+ FeaturePermlane16Swap,
+ FeatureAshrPkInsts,
+ FeatureAtomicFMinFMaxF64GlobalInsts,
+ FeatureAtomicFMinFMaxF64FlatInsts,
+ FeatureFmaMixBF16Insts,
+ FeatureGloballyAddressableScratch,
+ FeatureCvtPkF16F32Inst,
+ FeatureF16BF16ToFP6BF6ConversionScaleInsts,
+ FeatureIEEEMinimumMaximumInsts,
+ FeatureSWakeupBarrier,
+ FeatureClusters,
+ FeatureCubeInsts,
+ FeatureLerpInst,
+ FeatureSadInsts,
+ FeatureQsadInsts,
+ FeatureCvtNormInsts,
+ FeatureCvtPkNormVOP2Insts,
+   FeatureCvtPkNormVOP3Insts
+]>;
+
//===----------------------------------------------------------------------===//
def AMDGPUInstrInfo : InstrInfo {
@@ -2303,6 +2274,11 @@ def isGFX8GFX9GFX10GFX11 :
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX11">,
AssemblerPredicate<(all_of FeatureGFX8Insts, (not FeatureGFX12Insts))>;
+def isGFX8GFX9GFX10GFX11GFX12 :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&"
+ "Subtarget->getGeneration() < AMDGPUSubtarget::GFX13">,
+ AssemblerPredicate<(all_of FeatureGFX8Insts, (not FeatureGFX13Insts))>;
+
def isGFX7Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
AssemblerPredicate<(all_of FeatureCIInsts)>;
@@ -2360,18 +2336,6 @@ def isNotGFX940Plus :
Predicate<"!Subtarget->hasGFX940Insts()">,
AssemblerPredicate<(all_of (not FeatureGFX940Insts))>;
-def HasGFX950Insts :
- Predicate<"Subtarget->hasGFX950Insts()">,
- AssemblerPredicate<(all_of FeatureGFX950Insts)>;
-
-def HasPermlane16Swap :
- Predicate<"Subtarget->hasPermlane16Swap()">,
- AssemblerPredicate<(all_of FeaturePermlane16Swap)>;
-
-def HasPermlane32Swap :
- Predicate<"Subtarget->hasPermlane32Swap()">,
- AssemblerPredicate<(all_of FeaturePermlane32Swap)>;
-
def isGFX8GFX9NotGFX940 :
Predicate<"!Subtarget->hasGFX940Insts() &&"
"(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
@@ -2425,9 +2389,14 @@ def isGFX11Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11">,
AssemblerPredicate<(all_of FeatureGFX11Insts)>;
+def isGFX11PlusNot12_50 :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&"
+ "(Subtarget->getGeneration() >= AMDGPUSubtarget::GFX13 || !Subtarget->hasGFX1250Insts())">,
+ AssemblerPredicate<(all_of FeatureGFX11Insts, (any_of FeatureGFX13Insts, (not FeatureGFX1250Insts)))>;
+
def isGFX12Only :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX12">,
- AssemblerPredicate<(all_of FeatureGFX12Insts)>;
+ AssemblerPredicate<(all_of FeatureGFX12Insts, (not FeatureGFX13Insts))>;
def isGFX12Not12_50 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX12 && !Subtarget->hasGFX1250Insts()">,
@@ -2438,12 +2407,13 @@ def isGFX12Plus :
AssemblerPredicate<(all_of FeatureGFX12Insts)>;
def isGFX12PlusNot12_50 :
- Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12 && !Subtarget->hasGFX1250Insts()">,
- AssemblerPredicate<(all_of FeatureGFX12Insts, (not FeatureGFX1250Insts))>;
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12 &&"
+ "(Subtarget->getGeneration() >= AMDGPUSubtarget::GFX13 || !Subtarget->hasGFX1250Insts())">,
+ AssemblerPredicate<(all_of FeatureGFX12Insts, (any_of FeatureGFX13Insts, (not FeatureGFX1250Insts)))>;
def isGFX125xOnly :
- Predicate<"Subtarget->hasGFX1250Insts()">,
- AssemblerPredicate<(all_of FeatureGFX1250Insts)>;
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX12 && Subtarget->hasGFX1250Insts()">,
+ AssemblerPredicate<(all_of FeatureGFX1250Insts, (not FeatureGFX13Insts))>;
def isGFX1250Plus :
Predicate<"Subtarget->hasGFX1250Insts()">,
@@ -2454,63 +2424,27 @@ def isNotGFX1250Plus :
AssemblerPredicate<(all_of (not FeatureGFX1250Insts))>;
def isGFX940orGFX1250 :
- Predicate<"Subtarget->hasGFX940Insts() || Subtarget->hasGFX1250Insts()">,
- AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX1250Insts)>;
-
-def HasIEEEMinimumMaximumInsts :
- Predicate<"Subtarget->hasIEEEMinimumMaximumInsts()">,
- AssemblerPredicate<(all_of FeatureIEEEMinimumMaximumInsts)>;
-
-def HasMinimum3Maximum3F32 :
- Predicate<"Subtarget->hasMinimum3Maximum3F32()">,
- AssemblerPredicate<(all_of FeatureMinimum3Maximum3F32)>;
-
-def HasMinimum3Maximum3F16 :
- Predicate<"Subtarget->hasMinimum3Maximum3F16()">,
- AssemblerPredicate<(all_of FeatureMinimum3Maximum3F16)>;
-
-def HasMin3Max3PKF16 :
- Predicate<"Subtarget->hasMin3Max3PKF16()">,
- AssemblerPredicate<(all_of FeatureMin3Max3PKF16)>;
-
-def HasMinimum3Maximum3PKF16 :
- Predicate<"Subtarget->hasMinimum3Maximum3PKF16()">,
- AssemblerPredicate<(all_of FeatureMinimum3Maximum3PKF16)>;
-
+ Predicate<"Subtarget->hasGFX940Insts() ||"
+ "(Subtarget->hasGFX1250Insts() && !Subtarget->hasGFX13Insts())">,
+ AssemblerPredicate<(any_of FeatureGFX940Insts,
+ (all_of FeatureGFX1250Insts, (not FeatureGFX13Insts)))>;
-def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
- AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
+def isGFX13Only :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX13">,
+ AssemblerPredicate<(all_of FeatureGFX13Insts)>;
-def HasFlatBufferGlobalAtomicFaddF64Inst :
- Predicate<"Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst()">,
- AssemblerPredicate<(any_of FeatureFlatBufferGlobalAtomicFaddF64Inst)>;
+def isGFX13Plus :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX13">,
+ AssemblerPredicate<(all_of FeatureGFX13Insts)>;
-def HasAtomicFMinFMaxF32GlobalInsts :
- Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">,
- AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF32GlobalInsts)>;
-
-def HasAtomicFMinFMaxF64GlobalInsts :
- Predicate<"Subtarget->hasAtomicFMinFMaxF64GlobalInsts()">,
- AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64GlobalInsts)>;
-
-def HasAtomicFMinFMaxF32FlatInsts :
- Predicate<"Subtarget->hasAtomicFMinFMaxF32FlatInsts()">,
- AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF32FlatInsts)>;
-
-def HasAtomicFMinFMaxF64FlatInsts :
- Predicate<"Subtarget->hasAtomicFMinFMaxF64FlatInsts()">,
- AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64FlatInsts)>;
+def HasAtomicCondSubClampFlatInsts :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">,
+ AssemblerPredicate<(all_of FeatureGFX12Insts)>;
def HasLdsAtomicAddF64 :
Predicate<"Subtarget->hasLdsAtomicAddF64()">,
AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX1250Insts)>;
-def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,
- AssemblerPredicate<(all_of FeatureFlatGlobalInsts)>;
-def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">,
- AssemblerPredicate<(all_of FeatureFlatScratchInsts)>;
-def HasScalarFlatScratchInsts : Predicate<"Subtarget->hasScalarFlatScratchInsts()">,
- AssemblerPredicate<(all_of FeatureScalarFlatScratchInsts)>;
def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
AssemblerPredicate<(all_of FeatureGFX9Insts)>;
@@ -2519,24 +2453,17 @@ def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">,
def HasFlatScratchSVSMode : Predicate<"Subtarget->hasFlatScratchSVSMode()">,
AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>;
-def HasFlatGVSMode : Predicate<"Subtarget->hasFlatGVSMode()">,
- AssemblerPredicate<(all_of FeatureFlatGVSMode)>;
-
def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">,
AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>;
def HasGFX10_BEncoding : Predicate<"Subtarget->hasGFX10_BEncoding()">,
AssemblerPredicate<(all_of FeatureGFX10_BEncoding)>;
-def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
- AssemblerPredicate<(all_of FeatureUnpackedD16VMem)>;
def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
AssemblerPredicate<(all_of (not FeatureUnpackedD16VMem))>;
-def HasRestrictedSOffset : Predicate<"Subtarget->hasRestrictedSOffset()">,
- AssemblerPredicate<(all_of FeatureHasRestrictedSOffset)>;
def HasUnrestrictedSOffset : Predicate<"!Subtarget->hasRestrictedSOffset()">,
- AssemblerPredicate<(all_of (not FeatureHasRestrictedSOffset))>;
+ AssemblerPredicate<(all_of (not FeatureRestrictedSOffset))>;
def D16PreservesUnusedBits :
Predicate<"Subtarget->d16PreservesUnusedBits()">,
@@ -2552,7 +2479,7 @@ def HasFormattedMUBUFInsts : Predicate<"Subtarget->hasFormattedMUBUFInsts()">,
AssemblerPredicate<(all_of (not FeatureGFX1250Insts))>;
def HasExportInsts : Predicate<"Subtarget->hasExportInsts()">,
- AssemblerPredicate<(all_of (not FeatureGFX90AInsts), (not FeatureGFX1250Insts))>;
+ AssemblerPredicate<(any_of FeatureGFX13Insts, (all_of (not FeatureGFX90AInsts), (not FeatureGFX1250Insts)))>;
def HasVINTERPEncoding : Predicate<"Subtarget->hasVINTERPEncoding()">,
AssemblerPredicate<(all_of FeatureGFX11Insts, (not FeatureGFX1250Insts))>;
@@ -2563,18 +2490,10 @@ def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9
def HasLDSFPAtomicAddF32 : Predicate<"Subtarget->hasLDSFPAtomicAddF32()">,
AssemblerPredicate<(all_of FeatureGFX8Insts)>;
-def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">,
- AssemblerPredicate<(all_of FeatureAddNoCarryInsts)>;
-
-def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">;
+def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarryInsts()">;
def HasXNACKEnabled : Predicate<"Subtarget->isXNACKEnabled()">;
-def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
- AssemblerPredicate<(all_of Feature16BitInsts)>;
-
-def HasTrue16BitInsts : Predicate<"Subtarget->hasTrue16BitInsts()">,
- AssemblerPredicate<(all_of FeatureTrue16BitInsts)>;
def NotHasTrue16BitInsts : True16PredicateClass<"!Subtarget->hasTrue16BitInsts()">,
AssemblerPredicate<(all_of (not FeatureTrue16BitInsts))>;
@@ -2588,30 +2507,14 @@ def NotUseRealTrue16Insts : True16PredicateClass<"!Subtarget->useRealTrue16Insts
AssemblerPredicate<(not (all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts))>;
def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() && "
"!Subtarget->useRealTrue16Insts()">,
- AssemblerPredicate<(all_of FeatureTrue16BitInsts)>;
- // FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
- // AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
+ AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
def UseTrue16WithSramECC : True16PredicateClass<"Subtarget->useRealTrue16Insts() && "
"!Subtarget->d16PreservesUnusedBits()">;
-def HasD16Writes32BitVgpr: Predicate<"Subtarget->hasD16Writes32BitVgpr()">,
- AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureD16Writes32BitVgpr)>;
def NotHasD16Writes32BitVgpr: Predicate<"!Subtarget->hasD16Writes32BitVgpr()">,
AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, (not FeatureD16Writes32BitVgpr))>;
-def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
- AssemblerPredicate<(all_of FeatureBF16TransInsts)>;
-
-def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">,
- AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>;
-
-def HasBF16PackedInsts : Predicate<"Subtarget->hasBF16PackedInsts()">,
- AssemblerPredicate<(all_of FeatureBF16PackedInsts)>;
-
-def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
- AssemblerPredicate<(all_of FeatureVOP3P)>;
-
def NotHasMed3_16 : Predicate<"!Subtarget->hasMed3_16()">;
def HasMed3_16 : Predicate<"Subtarget->hasMed3_16()">;
@@ -2620,8 +2523,6 @@ def NotHasMinMaxDenormModes : Predicate<"!Subtarget->supportsMinMaxDenormModes()
def HasFminFmaxLegacy : Predicate<"Subtarget->hasFminFmaxLegacy()">;
-def HasSDWA : Predicate<"Subtarget->hasSDWA()">;
-
def HasSDWA8 : Predicate<"Subtarget->hasSDWA()">,
AssemblerPredicate<(all_of (not FeatureGFX9Insts), FeatureSDWA)>;
@@ -2639,12 +2540,6 @@ def HasDPP : Predicate<"Subtarget->hasDPP()">,
def HasDPP8 : Predicate<"Subtarget->hasDPP8()">,
AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP8)>;
-def HasDPALU_DPP : Predicate<"Subtarget->hasDPALU_DPP()">,
- AssemblerPredicate<(all_of FeatureDPALU_DPP)>;
-
-def HasPackedFP32Ops : Predicate<"Subtarget->hasPackedFP32Ops()">,
- AssemblerPredicate<(all_of FeaturePackedFP32Ops)>;
-
def HasPkMovB32 : Predicate<"Subtarget->hasPkMovB32()">,
AssemblerPredicate<(all_of FeatureGFX90AInsts)>;
@@ -2656,14 +2551,6 @@ def HasFmaakFmamkF64Insts :
Predicate<"Subtarget->hasFmaakFmamkF64Insts()">,
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
-def HasAddMinMaxInsts :
- Predicate<"Subtarget->hasAddMinMaxInsts()">,
- AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
-
-def HasPkAddMinMaxInsts :
- Predicate<"Subtarget->hasPkAddMinMaxInsts()">,
- AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
-
def HasPkMinMax3Insts :
Predicate<"Subtarget->hasPkMinMax3Insts()">,
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
@@ -2672,295 +2559,92 @@ def HasSGetShaderCyclesInst :
Predicate<"Subtarget->hasSGetShaderCyclesInst()">,
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
-def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">,
- AssemblerPredicate<(all_of FeatureImageInsts)>;
-
-def HasExtendedImageInsts : Predicate<"Subtarget->hasExtendedImageInsts()">,
- AssemblerPredicate<(all_of FeatureExtendedImageInsts)>;
-
-def HasR128A16 : Predicate<"Subtarget->hasR128A16()">,
- AssemblerPredicate<(all_of FeatureR128A16)>;
-
-def HasA16 : Predicate<"Subtarget->hasA16()">,
- AssemblerPredicate<(all_of FeatureA16)>;
-
-def HasG16 : Predicate<"Subtarget->hasG16()">,
- AssemblerPredicate<(all_of FeatureG16)>;
-
def HasDPP16 : Predicate<"Subtarget->hasDPP()">,
AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP)>;
-def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">,
- AssemblerPredicate<(all_of FeatureIntClamp)>;
-
-def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">,
- AssemblerPredicate<(all_of FeatureMadMixInsts)>;
-
-def HasScalarStores : Predicate<"Subtarget->hasScalarStores()">,
- AssemblerPredicate<(all_of FeatureScalarStores)>;
-
-def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">,
- AssemblerPredicate<(all_of FeatureScalarAtomics)>;
-
-def HasNoSdstCMPX : Predicate<"Subtarget->hasNoSdstCMPX()">,
- AssemblerPredicate<(all_of FeatureNoSdstCMPX)>;
-
def HasSdstCMPX : Predicate<"!Subtarget->hasNoSdstCMPX()">,
AssemblerPredicate<(all_of (not FeatureNoSdstCMPX))>;
def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
-def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
- AssemblerPredicate<(all_of FeatureVGPRIndexMode)>;
-def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
- AssemblerPredicate<(all_of FeatureMovrel)>;
-
-def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
- AssemblerPredicate<(all_of FeatureFmaMixInsts)>;
-
-def HasFmaMixBF16Insts : Predicate<"Subtarget->hasFmaMixBF16Insts()">,
- AssemblerPredicate<(all_of FeatureFmaMixBF16Insts)>;
-
-def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
- AssemblerPredicate<(all_of FeatureDLInsts)>;
-
-def HasFmacF64Inst : Predicate<"Subtarget->hasFmacF64Inst()">,
- AssemblerPredicate<(all_of FeatureFmacF64Inst)>;
-
-def HasDot1Insts : Predicate<"Subtarget->hasDot1Insts()">,
- AssemblerPredicate<(all_of FeatureDot1Insts)>;
-
-def HasDot2Insts : Predicate<"Subtarget->hasDot2Insts()">,
- AssemblerPredicate<(all_of FeatureDot2Insts)>;
-
-def HasDot3Insts : Predicate<"Subtarget->hasDot3Insts()">,
- AssemblerPredicate<(all_of FeatureDot3Insts)>;
-
-def HasDot4Insts : Predicate<"Subtarget->hasDot4Insts()">,
- AssemblerPredicate<(all_of FeatureDot4Insts)>;
-
-def HasDot5Insts : Predicate<"Subtarget->hasDot5Insts()">,
- AssemblerPredicate<(all_of FeatureDot5Insts)>;
-
-def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">,
- AssemblerPredicate<(all_of FeatureDot6Insts)>;
-
-def HasDot7Insts : Predicate<"Subtarget->hasDot7Insts()">,
- AssemblerPredicate<(all_of FeatureDot7Insts)>;
-
-def HasDot8Insts : Predicate<"Subtarget->hasDot8Insts()">,
- AssemblerPredicate<(all_of FeatureDot8Insts)>;
-
-def HasDot9Insts : Predicate<"Subtarget->hasDot9Insts()">,
- AssemblerPredicate<(all_of FeatureDot9Insts)>;
-
-def HasDot10Insts : Predicate<"Subtarget->hasDot10Insts()">,
- AssemblerPredicate<(all_of FeatureDot10Insts)>;
-
-def HasDot11Insts : Predicate<"Subtarget->hasDot11Insts()">,
- AssemblerPredicate<(all_of FeatureDot11Insts)>;
-
-def HasDot12Insts : Predicate<"Subtarget->hasDot12Insts()">,
- AssemblerPredicate<(all_of FeatureDot12Insts)>;
-
-def HasDot13Insts : Predicate<"Subtarget->hasDot13Insts()">,
- AssemblerPredicate<(all_of FeatureDot13Insts)>;
-
-def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">,
- AssemblerPredicate<(all_of FeatureGetWaveIdInst)>;
-
-def HasMAIInsts : Predicate<"Subtarget->hasMAIInsts()">,
- AssemblerPredicate<(all_of FeatureMAIInsts)>;
def NotHasMAIInsts : Predicate<"!Subtarget->hasMAIInsts()">,
AssemblerPredicate<(all_of (not FeatureMAIInsts))>;
-def HasSMemRealTime : Predicate<"Subtarget->hasSMemRealTime()">,
- AssemblerPredicate<(all_of FeatureSMemRealTime)>;
-
-def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">,
- AssemblerPredicate<(all_of FeatureSMemTimeInst)>;
-
-def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">,
- AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>;
-
-def HasShaderCyclesHiLoRegisters : Predicate<"Subtarget->hasShaderCyclesHiLoRegisters()">;
-
-def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">,
- AssemblerPredicate<(all_of FeatureFP8Insts)>;
-
-def HasFP8ConversionInsts : Predicate<"Subtarget->hasFP8ConversionInsts()">,
- AssemblerPredicate<(all_of FeatureFP8ConversionInsts)>;
-
-def HasFP8E5M3Insts : Predicate<"Subtarget->hasFP8E5M3Insts()">,
- AssemblerPredicate<(all_of FeatureFP8E5M3Insts)>;
-
def NotHasFP8E5M3Insts : Predicate<"!Subtarget->hasFP8E5M3Insts()">,
AssemblerPredicate<(all_of (not FeatureFP8E5M3Insts))>;
-def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">,
- AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>;
-
-def HasMadMacF32Insts : Predicate<"Subtarget->hasMadMacF32Insts()">,
- AssemblerPredicate<(all_of FeatureMadMacF32Insts)>;
-
def HasFmaLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts()">,
AssemblerPredicate<(any_of FeatureGFX10_3Insts)>;
-def HasAtomicDsPkAdd16Insts : Predicate<"Subtarget->hasAtomicDsPkAdd16Insts()">,
- AssemblerPredicate<(any_of FeatureAtomicDsPkAdd16Insts)>;
+def HasFmacLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts() && Subtarget->getGeneration() < AMDGPUSubtarget::GFX12">,
+ AssemblerPredicate<(all_of FeatureGFX10_3Insts, (not FeatureGFX12Insts))>;
-def HasAtomicFlatPkAdd16Insts : Predicate<"Subtarget->hasAtomicFlatPkAdd16Insts()">,
- AssemblerPredicate<(any_of FeatureAtomicFlatPkAdd16Insts)>;
+def HasAtomicDsCondSubClampInsts :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">,
+ AssemblerPredicate<(all_of FeatureGFX12Insts)>;
-def HasAtomicFaddRtnInsts : Predicate<"Subtarget->hasAtomicFaddRtnInsts()">,
- AssemblerPredicate<(all_of FeatureAtomicFaddRtnInsts)>;
-def HasAtomicFaddNoRtnInsts : Predicate<"Subtarget->hasAtomicFaddNoRtnInsts()">,
- AssemblerPredicate<(all_of FeatureAtomicFaddNoRtnInsts)>;
def HasAtomicBufferGlobalPkAddF16NoRtnInsts
: Predicate<"Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() || Subtarget->hasAtomicBufferGlobalPkAddF16Insts()">,
AssemblerPredicate<(any_of FeatureAtomicBufferGlobalPkAddF16NoRtnInsts, FeatureAtomicBufferGlobalPkAddF16Insts)>;
-def HasAtomicBufferGlobalPkAddF16Insts
- : Predicate<"Subtarget->hasAtomicBufferGlobalPkAddF16Insts()">,
- AssemblerPredicate<(all_of FeatureAtomicBufferGlobalPkAddF16Insts)>;
-def HasAtomicGlobalPkAddBF16Inst
- : Predicate<"Subtarget->hasAtomicGlobalPkAddBF16Inst()">,
- AssemblerPredicate<(all_of FeatureAtomicGlobalPkAddBF16Inst)>;
-def HasAtomicBufferPkAddBF16Inst
- : Predicate<"Subtarget->hasAtomicBufferPkAddBF16Inst()">,
- AssemblerPredicate<(all_of FeatureAtomicBufferPkAddBF16Inst)>;
-def HasFlatAtomicFaddF32Inst
- : Predicate<"Subtarget->hasFlatAtomicFaddF32Inst()">,
- AssemblerPredicate<(all_of FeatureFlatAtomicFaddF32Inst)>;
-
-def HasDefaultComponentZero
- : Predicate<"Subtarget->hasDefaultComponentZero()">,
- AssemblerPredicate<(all_of FeatureDefaultComponentZero)>;
-def HasDefaultComponentBroadcast
- : Predicate<"Subtarget->hasDefaultComponentBroadcast()">,
- AssemblerPredicate<(all_of FeatureDefaultComponentBroadcast)>;
-
-def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">,
- AssemblerPredicate<(all_of FeatureDsSrc2Insts)>;
def HasAddPC64Inst : Predicate<"Subtarget->hasAddPC64Inst()">,
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
-def EnableFlatScratch : Predicate<"Subtarget->enableFlatScratch()">;
-
-def DisableFlatScratch : Predicate<"!Subtarget->enableFlatScratch()">;
-
-def HasUnalignedAccessMode : Predicate<"Subtarget->hasUnalignedAccessMode()">,
- AssemblerPredicate<(all_of FeatureUnalignedAccessMode)>;
+def HasFlatScratchEnabled : Predicate<"Subtarget->hasFlatScratchEnabled()">;
-def HasMADIntraFwdBug : Predicate<"Subtarget->hasMADIntraFwdBug()">;
+def NotHasFlatScratchEnabled : Predicate<"!Subtarget->hasFlatScratchEnabled()">;
-def HasNotMADIntraFwdBug : Predicate<"!Subtarget->hasMADIntraFwdBug()">;
-
-def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">,
- AssemblerPredicate<(all_of FeatureSALUFloatInsts)>;
+def NotHasMADIntraFwdBug : Predicate<"!Subtarget->hasMADIntraFwdBug()">;
def NotHasSALUFloatInsts : Predicate<"!Subtarget->hasSALUFloatInsts()">,
AssemblerPredicate<(all_of (not FeatureSALUFloatInsts))>;
-def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
- AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>;
-
-def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">,
- AssemblerPredicate<(all_of FeatureBitOp3Insts)>;
-
-def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">,
- AssemblerPredicate<(all_of FeatureTanhInsts)>;
-
-def HasTensorCvtLutInsts : Predicate<"Subtarget->hasTensorCvtLutInsts()">,
- AssemblerPredicate<(all_of FeatureTensorCvtLutInsts)>;
-
-def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">,
- AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>;
-
-def HasPrngInst : Predicate<"Subtarget->hasPrngInst()">,
- AssemblerPredicate<(all_of FeaturePrngInst)>;
+def NotHasCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">;
-def HasBVHDualAndBVH8Insts : Predicate<"Subtarget->hasBVHDualAndBVH8Insts()">,
- AssemblerPredicate<(all_of FeatureBVHDualAndBVH8Insts)>;
-
-def Has64BitLiterals : Predicate<"Subtarget->has64BitLiterals()">,
- AssemblerPredicate<(all_of Feature64BitLiterals)>;
-
-def Has1024AddressableVGPRs : Predicate<"Subtarget->has1024AddressableVGPRs()">,
- AssemblerPredicate<(all_of Feature1024AddressableVGPRs)>;
-
-def HasWaitXcnt : Predicate<"Subtarget->hasWaitXcnt()">,
- AssemblerPredicate<(all_of FeatureWaitXcnt)>;
-
-def HasFP8ConversionScaleInsts : Predicate<"Subtarget->hasFP8ConversionScaleInsts()">,
- AssemblerPredicate<(all_of FeatureFP8ConversionScaleInsts)>;
-
-def HasBF8ConversionScaleInsts : Predicate<"Subtarget->hasBF8ConversionScaleInsts()">,
- AssemblerPredicate<(all_of FeatureBF8ConversionScaleInsts)>;
-
-def HasFP4ConversionScaleInsts : Predicate<"Subtarget->hasFP4ConversionScaleInsts()">,
- AssemblerPredicate<(all_of FeatureFP4ConversionScaleInsts)>;
-
-def HasFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasFP6BF6ConversionScaleInsts()">,
- AssemblerPredicate<(all_of FeatureFP6BF6ConversionScaleInsts)>;
-
-def HasF16BF16ToFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasF16BF16ToFP6BF6ConversionScaleInsts()">,
- AssemblerPredicate<(all_of FeatureF16BF16ToFP6BF6ConversionScaleInsts)>;
-
-def HasCvtPkF16F32Inst : Predicate<"Subtarget->hasCvtPkF16F32Inst()">,
- AssemblerPredicate<(all_of FeatureCvtPkF16F32Inst)>;
-
-def HasF32ToF16BF16ConversionSRInsts : Predicate<"Subtarget->hasF32ToF16BF16ConversionSRInsts()">,
- AssemblerPredicate<(all_of FeatureF32ToF16BF16ConversionSRInsts)>;
-
-def HasGDS : Predicate<"Subtarget->hasGDS()">;
-
-def HasGWS : Predicate<"Subtarget->hasGWS()">;
-
-def HasCvtFP8VOP1Bug : Predicate<"Subtarget->hasCvtFP8VOP1Bug()">;
-def HasNoCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">;
-
-def HasAtomicCSubNoRtnInsts : Predicate<"Subtarget->hasAtomicCSubNoRtnInsts()">;
-
-def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;
-
-def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">,
- AssemblerPredicate<(all_of FeatureXF32Insts)>;
-
-def HasVmemPrefInsts : Predicate<"Subtarget->hasVmemPrefInsts()">,
- AssemblerPredicate<(all_of FeatureVmemPrefInsts)>;
-
-def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
- AssemblerPredicate<(all_of FeatureAshrPkInsts)>;
-
-def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,
- AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>;
-
-def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">,
- AssemblerPredicate<(all_of FeatureAddSubU64Insts)>;
-
-def HasMadU32Inst : Predicate<"Subtarget->hasMadU32Inst()">,
- AssemblerPredicate<(all_of FeatureMadU32Inst)>;
+def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">,
+ AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>;
-def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">,
- AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>;
+def NotNeedsAlignedVGPRs : Predicate<"!Subtarget->needsAlignedVGPRs()">,
+ AssemblerPredicate<(all_of (not FeatureRequiresAlignedVGPRs))>;
-def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">,
- AssemblerPredicate<(all_of FeatureSetPrioIncWgInst)>;
+def isWave32 : Predicate<"Subtarget->isWave32()">,
+ AssemblerPredicate <(any_of FeatureWavefrontSize32,
+ FeatureAssemblerPermissiveWavesize)>;
+def isWave64 : Predicate<"Subtarget->isWave64()">,
+ AssemblerPredicate <(any_of FeatureWavefrontSize64,
+ FeatureAssemblerPermissiveWavesize)>;
-def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">,
- AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>;
+def isWave32Strict : Predicate<"Subtarget->isWave32()">,
+ AssemblerPredicate <(all_of FeatureWavefrontSize32)>;
+def isWave64Strict : Predicate<"Subtarget->isWave64()">,
+ AssemblerPredicate <(all_of FeatureWavefrontSize64)>;
//===----------------------------------------------------------------------===//
// HwModes
//===----------------------------------------------------------------------===//
-// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement
+defvar DefaultMode_Wave64 = DefaultMode;
+defvar DefaultMode_Wave32 = HwMode<[isWave32, NotNeedsAlignedVGPRs]>;
+
+// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement. Implied
+// wave64.
def AVAlign2LoadStoreMode : HwMode<[HasMAIInsts, NeedsAlignedVGPRs]>;
// gfx1250, has alignment requirement but no AGPRs.
-def AlignedVGPRNoAGPRMode : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs]>;
+def AlignedVGPRNoAGPRMode_Wave32 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave32Strict]>;
+def AlignedVGPRNoAGPRMode_Wave64 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave64Strict]>;
+
+// FIXME: It should be possible to define a separate hwmode that depends
+// only on wavesize for just ValueTypes, since these use different HwMode
+// namespaces. However, if we don't define the full set of modes used for
+// RegClassByHwMode, tablegen crashes for some reason.
+def WaveSizeVT : ValueTypeByHwMode<[
+ DefaultMode_Wave64,
+ AVAlign2LoadStoreMode,
+ AlignedVGPRNoAGPRMode_Wave64,
+ DefaultMode_Wave32,
+ AlignedVGPRNoAGPRMode_Wave32], [i64, i64, i64, i32, i32]>;
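+
+// For example, a wave-size-dependent value type declared with WaveSizeVT
+// (such as a lane mask) resolves to i64 under the three wave64 modes above
+// and to i32 under the two wave32 modes.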
// Include AMDGPU TD files
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index c28c25f..2bdadda 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -65,7 +65,7 @@ recursivelyVisitUsers(GlobalValue &GV,
continue;
if (Instruction *I = dyn_cast<Instruction>(U)) {
- Function *F = I->getParent()->getParent();
+ Function *F = I->getFunction();
if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
// FIXME: This is a horrible hack. We should always respect noinline,
// and just let us hit the error when we can't handle this.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index dda8033..346e257 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -19,7 +19,7 @@ using namespace llvm;
#define DEBUG_TYPE "amdgpu-argument-reg-usage-info"
-INITIALIZE_PASS(AMDGPUArgumentUsageInfo, DEBUG_TYPE,
+INITIALIZE_PASS(AMDGPUArgumentUsageInfoWrapperLegacy, DEBUG_TYPE,
"Argument Register Usage Information Storage", false, true)
void ArgDescriptor::print(raw_ostream &OS,
@@ -42,7 +42,7 @@ void ArgDescriptor::print(raw_ostream &OS,
OS << '\n';
}
-char AMDGPUArgumentUsageInfo::ID = 0;
+char AMDGPUArgumentUsageInfoWrapperLegacy::ID = 0;
const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{};
@@ -50,15 +50,6 @@ const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{};
const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::FixedABIFunctionInfo
= AMDGPUFunctionArgInfo::fixedABILayout();
-bool AMDGPUArgumentUsageInfo::doInitialization(Module &M) {
- return false;
-}
-
-bool AMDGPUArgumentUsageInfo::doFinalization(Module &M) {
- ArgInfoMap.clear();
- return false;
-}
-
// TODO: Print preload kernargs?
void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const {
for (const auto &FI : ArgInfoMap) {
@@ -86,6 +77,12 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const {
}
}
+bool AMDGPUArgumentUsageInfo::invalidate(Module &M, const PreservedAnalyses &PA,
+ ModuleAnalysisManager::Invalidator &) {
+ auto PAC = PA.getChecker<AMDGPUArgumentUsageAnalysis>();
+ return !PAC.preservedWhenStateless();
+}
+
std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
AMDGPUFunctionArgInfo::getPreloadedValue(
AMDGPUFunctionArgInfo::PreloadedValue Value) const {
@@ -191,3 +188,10 @@ AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const {
return FixedABIFunctionInfo;
return I->second;
}
+
+AnalysisKey AMDGPUArgumentUsageAnalysis::Key;
+
+AMDGPUArgumentUsageInfo
+AMDGPUArgumentUsageAnalysis::run(Module &M, ModuleAnalysisManager &) {
+ return AMDGPUArgumentUsageInfo();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 1064e57..f38e49b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -12,7 +12,10 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/Register.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include <variant>
namespace llvm {
@@ -27,55 +30,44 @@ private:
friend struct AMDGPUFunctionArgInfo;
friend class AMDGPUArgumentUsageInfo;
- union {
- MCRegister Reg;
- unsigned StackOffset;
- };
+ std::variant<std::monostate, MCRegister, unsigned> Val;
// Bitmask to locate argument within the register.
unsigned Mask;
- bool IsStack : 1;
- bool IsSet : 1;
-
public:
- ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, bool IsStack = false,
- bool IsSet = false)
- : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
+ ArgDescriptor(unsigned Mask = ~0u) : Mask(Mask) {}
static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
- return ArgDescriptor(Reg, Mask, false, true);
+ ArgDescriptor Ret(Mask);
+ Ret.Val = Reg.asMCReg();
+ return Ret;
}
static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
- return ArgDescriptor(Offset, Mask, true, true);
+ ArgDescriptor Ret(Mask);
+ Ret.Val = Offset;
+ return Ret;
}
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
- return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
+ // Copy the descriptor, then change the mask.
+ ArgDescriptor Ret(Arg);
+ Ret.Mask = Mask;
+ return Ret;
}
- bool isSet() const {
- return IsSet;
- }
+ bool isSet() const { return !std::holds_alternative<std::monostate>(Val); }
explicit operator bool() const {
return isSet();
}
- bool isRegister() const {
- return !IsStack;
- }
+ bool isRegister() const { return std::holds_alternative<MCRegister>(Val); }
- MCRegister getRegister() const {
- assert(!IsStack);
- return Reg;
- }
+ MCRegister getRegister() const { return std::get<MCRegister>(Val); }
- unsigned getStackOffset() const {
- assert(IsStack);
- return StackOffset;
- }
+ unsigned getStackOffset() const { return std::get<unsigned>(Val); }
unsigned getMask() const {
// None of the target SGPRs or VGPRs are expected to have a 'zero' mask.
@@ -96,7 +88,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
}
struct KernArgPreloadDescriptor : public ArgDescriptor {
- KernArgPreloadDescriptor() {}
+ KernArgPreloadDescriptor() = default;
SmallVector<MCRegister> Regs;
};
@@ -178,32 +170,67 @@ struct AMDGPUFunctionArgInfo {
static AMDGPUFunctionArgInfo fixedABILayout();
};
-class AMDGPUArgumentUsageInfo : public ImmutablePass {
+class AMDGPUArgumentUsageInfo {
private:
DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
public:
- static char ID;
-
static const AMDGPUFunctionArgInfo ExternFunctionInfo;
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo;
- AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
+ void print(raw_ostream &OS, const Module *M = nullptr) const;
+
+ void clear() { ArgInfoMap.clear(); }
+
+ void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
+ ArgInfoMap[&F] = ArgInfo;
+ }
+
+ const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
+
+ bool invalidate(Module &M, const PreservedAnalyses &PA,
+ ModuleAnalysisManager::Invalidator &Inv);
+};
+
+class AMDGPUArgumentUsageInfoWrapperLegacy : public ImmutablePass {
+ std::unique_ptr<AMDGPUArgumentUsageInfo> AUIP;
+
+public:
+ static char ID;
+
+ AMDGPUArgumentUsageInfoWrapperLegacy() : ImmutablePass(ID) {}
+
+ AMDGPUArgumentUsageInfo &getArgUsageInfo() { return *AUIP; }
+ const AMDGPUArgumentUsageInfo &getArgUsageInfo() const { return *AUIP; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
}
- bool doInitialization(Module &M) override;
- bool doFinalization(Module &M) override;
+ bool doInitialization(Module &M) override {
+ AUIP = std::make_unique<AMDGPUArgumentUsageInfo>();
+ return false;
+ }
- void print(raw_ostream &OS, const Module *M = nullptr) const override;
+ bool doFinalization(Module &M) override {
+ AUIP->clear();
+ return false;
+ }
- void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
- ArgInfoMap[&F] = ArgInfo;
+ void print(raw_ostream &OS, const Module *M = nullptr) const override {
+ AUIP->print(OS, M);
}
+};
- const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
+class AMDGPUArgumentUsageAnalysis
+ : public AnalysisInfoMixin<AMDGPUArgumentUsageAnalysis> {
+ friend AnalysisInfoMixin<AMDGPUArgumentUsageAnalysis>;
+ static AnalysisKey Key;
+
+public:
+ using Result = AMDGPUArgumentUsageInfo;
+
+ AMDGPUArgumentUsageInfo run(Module &M, ModuleAnalysisManager &);
};
} // end namespace llvm
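
Note on the hunk above: the union plus IsStack/IsSet flag pair becomes a
tagged std::variant whose std::monostate alternative models the "unset"
state. A minimal standalone C++ sketch of the same idiom (hypothetical
names, not the LLVM class):

    #include <cassert>
    #include <variant>

    struct Desc {
      // Distinct alternative types act as the tag: monostate = unset,
      // int = register id, unsigned = stack offset.
      std::variant<std::monostate, int, unsigned> Val;
      bool isSet() const { return !std::holds_alternative<std::monostate>(Val); }
      bool isRegister() const { return std::holds_alternative<int>(Val); }
    };

    int main() {
      Desc D;
      assert(!D.isSet());  // default-constructed: neither register nor stack
      D.Val = 3;           // int alternative: a "register"
      assert(D.isSet() && D.isRegister());
      D.Val = 16u;         // unsigned alternative: a "stack offset"
      assert(D.isSet() && !D.isRegister());
      return 0;
    }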
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp
index 19e2a6a..9af3b05 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp
@@ -208,7 +208,8 @@ void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns,
Value *NumBits = IRB.CreateTypeSize(IntptrTy, TypeStoreSize);
Value *Size = IRB.CreateLShr(NumBits, ConstantInt::get(IntptrTy, 3));
Value *AddrLong = IRB.CreatePtrToInt(Addr, IntptrTy);
- Value *SizeMinusOne = IRB.CreateAdd(Size, ConstantInt::get(IntptrTy, -1));
+ Value *SizeMinusOne =
+ IRB.CreateAdd(Size, ConstantInt::getAllOnesValue(IntptrTy));
Value *LastByte =
IRB.CreateIntToPtr(IRB.CreateAdd(AddrLong, SizeMinusOne), AddrTy);
instrumentAddressImpl(M, IRB, OrigIns, InsertBefore, Addr, {}, 8, IsWrite,
@@ -244,11 +245,8 @@ void getInterestingMemoryOperands(
// Masked store has an initial operand for the value.
unsigned OpOffset = IsWrite ? 1 : 0;
Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType();
- MaybeAlign Alignment = Align(1);
- // Otherwise no alignment guarantees. We probably got Undef.
- if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
- Alignment = Op->getMaybeAlignValue();
- Value *Mask = CI->getOperand(2 + OpOffset);
+ MaybeAlign Alignment = CI->getParamAlign(OpOffset);
+ Value *Mask = CI->getOperand(1 + OpOffset);
Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, Mask);
break;
}
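
Note on the getInterestingMemoryOperands hunk: newer masked load/store
intrinsics carry the pointer alignment as a parameter attribute rather than
an explicit operand, so the mask moves from operand 2 + OpOffset down to
1 + OpOffset. A hedged C++ sketch of reading both under that layout (the
helper name is hypothetical; the CallBase APIs are real):

    #include "llvm/IR/Instructions.h"
    #include "llvm/Support/Alignment.h"
    #include <utility>
    using namespace llvm;

    // Mask and alignment of a masked load/store call site, assuming the
    // alignment is expressed as a parameter attribute on the pointer.
    static std::pair<Value *, MaybeAlign> getMaskAndAlign(CallInst *CI,
                                                          bool IsWrite) {
      unsigned OpOffset = IsWrite ? 1 : 0;        // store: operand 0 is the value
      MaybeAlign A = CI->getParamAlign(OpOffset); // attribute on the pointer arg
      Value *Mask = CI->getOperand(1 + OpOffset); // mask follows the pointer
      return {Mask, A};
    }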
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 29f8f9b..7d2df427 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -323,7 +323,7 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
"' is already defined");
const DataLayout &DL = GV->getDataLayout();
- uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
+ uint64_t Size = GV->getGlobalSize(DL);
Align Alignment = GV->getAlign().value_or(Align(4));
emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
@@ -358,6 +358,32 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) {
return AsmPrinter::doInitialization(M);
}
+/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
+///
+/// Removes the dependency on GCNSubtarget in the resulting expression by
+/// extracting only the values needed for the occupancy computation. Should
+/// match the computeOccupancy implementation without passing \p STM on.
+const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
+ const MCExpr *NumVGPRs,
+ unsigned DynamicVGPRBlockSize,
+ const GCNSubtarget &STM, MCContext &Ctx) {
+ unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
+ unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
+ unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
+ unsigned Generation = STM.getGeneration();
+
+ auto CreateExpr = [&Ctx](unsigned Value) {
+ return MCConstantExpr::create(Value, Ctx);
+ };
+
+ return AMDGPUMCExpr::create(AMDGPUMCExpr::AGVK_Occupancy,
+ {CreateExpr(MaxWaves), CreateExpr(Granule),
+ CreateExpr(TargetTotalNumVGPRs),
+ CreateExpr(Generation), CreateExpr(InitOcc),
+ NumSGPRs, NumVGPRs},
+ Ctx);
+}
+
void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
return;
@@ -459,7 +485,7 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
MaxWaves, MFI.getDynamicVGPRBlockSize())});
uint64_t NumSGPRsForWavesPerEU = std::max(
{NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
- const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
+ const MCExpr *OccupancyExpr = createOccupancy(
STM.getOccupancyWithWorkGroupSizes(*MF).second,
MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
@@ -508,9 +534,9 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
MCSectionELF *MaxGPRSection =
OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
OutStreamer->switchSection(MaxGPRSection);
- getTargetStreamer()->EmitMCResourceMaximums(RI.getMaxVGPRSymbol(OutContext),
- RI.getMaxAGPRSymbol(OutContext),
- RI.getMaxSGPRSymbol(OutContext));
+ getTargetStreamer()->EmitMCResourceMaximums(
+ RI.getMaxVGPRSymbol(OutContext), RI.getMaxAGPRSymbol(OutContext),
+ RI.getMaxSGPRSymbol(OutContext), RI.getMaxNamedBarrierSymbol(OutContext));
OutStreamer->popSection();
for (Function &F : M.functions())
@@ -634,7 +660,7 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
(void)PGRM_Rsrc3;
(void)EvaluatableRsrc3;
assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
- STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) || !EvaluatableRsrc3 ||
+ STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || !EvaluatableRsrc3 ||
static_cast<uint64_t>(PGRM_Rsrc3) == 0);
KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
@@ -805,7 +831,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
" AccumOffset: " + getMCExprStr(AdjustedAccum), false);
}
- if (AMDGPU::isGFX1250(STM))
+ if (STM.hasGFX1250Insts())
OutStreamer->emitRawComment(
" NamedBarCnt: " + getMCExprStr(CurrentProgramInfo.NamedBarCnt),
false);
@@ -841,7 +867,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
[[maybe_unused]] int64_t PGMRSrc3;
assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
- STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) ||
+ STM.hasGFX90AInsts() || STM.hasGFX1250Insts() ||
(CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
static_cast<uint64_t>(PGMRSrc3) == 0));
if (STM.hasGFX90AInsts()) {
@@ -1160,21 +1186,20 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// Make clamp modifier on NaN input returns 0.
ProgInfo.DX10Clamp = Mode.DX10Clamp;
- unsigned LDSAlignShift;
- if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) {
- // LDS is allocated in 256 dword blocks.
- LDSAlignShift = 10;
- } else if (STM.getFeatureBits().test(
- FeatureAddressableLocalMemorySize163840)) {
- // LDS is allocated in 320 dword blocks.
+ unsigned LDSAlignShift = 8;
+ switch (getLdsDwGranularity(STM)) {
+ case 512:
+ case 320:
LDSAlignShift = 11;
- } else if (STM.getFeatureBits().test(
- FeatureAddressableLocalMemorySize65536)) {
- // LDS is allocated in 128 dword blocks.
+ break;
+ case 128:
LDSAlignShift = 9;
- } else {
- // LDS is allocated in 64 dword blocks.
+ break;
+ case 64:
LDSAlignShift = 8;
+ break;
+ default:
+ llvm_unreachable("invald LDS block size");
}
ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
@@ -1230,8 +1255,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
// For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
- ProgInfo.TrapHandlerEnable =
- STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
+ ProgInfo.TrapHandlerEnable = STM.isAmdHsaOS() ? 0 : STM.hasTrapHandler();
ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
@@ -1264,13 +1288,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
}
- if (AMDGPU::isGFX1250(STM))
+ if (STM.hasGFX1250Insts())
ProgInfo.ComputePGMRSrc3 =
SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
- ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
+ ProgInfo.Occupancy = createOccupancy(
STM.computeOccupancy(F, ProgInfo.LDSSize).second,
ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,
MFI->getDynamicVGPRBlockSize(), STM, Ctx);
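
Note on the LDSAlignShift switch above: the shift is ceil(log2(block size in
bytes)), where the block size is the allocation granularity in dwords times
4. So 64 dwords gives 256-byte blocks (shift 8), 128 gives 512 (shift 9),
and both 320 (1280 bytes, rounded up) and 512 land on 2048 (shift 11). A
small self-contained check of that arithmetic (C++20):

    #include <bit>
    #include <cassert>
    #include <cstdint>

    static unsigned ldsAlignShift(uint32_t GranularityDwords) {
      uint32_t Bytes = GranularityDwords * 4;
      return 32 - std::countl_zero(Bytes - 1); // ceil(log2(Bytes))
    }

    int main() {
      assert(ldsAlignShift(64) == 8);
      assert(ldsAlignShift(128) == 9);
      assert(ldsAlignShift(320) == 11); // 1280 bytes rounds up to 2048
      assert(ldsAlignShift(512) == 11);
      return 0;
    }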
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 0a163f8..784ee36 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -589,7 +589,7 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
// return the next active lane
auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);
- auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
+ auto *InverseMask = B.CreateXor(Mask, ConstantInt::getAllOnesValue(WaveTy));
auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
ActiveBits->addIncoming(NewActiveBits, ComputeLoop);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 9907c88f..b86a4ea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -38,9 +38,10 @@ enum ImplicitArgumentPositions {
#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
enum ImplicitArgumentMask {
- NOT_IMPLICIT_INPUT = 0,
+ UNKNOWN_INTRINSIC = 0,
#include "AMDGPUAttributes.def"
- ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
+ ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1,
+ NOT_IMPLICIT_INPUT
};
#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
@@ -115,7 +116,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
return QUEUE_PTR;
default:
- return NOT_IMPLICIT_INPUT;
+ return UNKNOWN_INTRINSIC;
}
}
@@ -200,16 +201,6 @@ public:
/// Get code object version.
unsigned getCodeObjectVersion() const { return CodeObjectVersion; }
- /// Get the effective value of "amdgpu-waves-per-eu" for the function,
- /// accounting for the interaction with the passed value to use for
- /// "amdgpu-flat-work-group-size".
- std::pair<unsigned, unsigned>
- getWavesPerEU(const Function &F,
- std::pair<unsigned, unsigned> FlatWorkGroupSize) {
- const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F);
- }
-
std::optional<std::pair<unsigned, unsigned>>
getWavesPerEUAttr(const Function &F) {
auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu",
@@ -223,15 +214,6 @@ public:
return std::make_pair(Val->first, *(Val->second));
}
- std::pair<unsigned, unsigned>
- getEffectiveWavesPerEU(const Function &F,
- std::pair<unsigned, unsigned> WavesPerEU,
- std::pair<unsigned, unsigned> FlatWorkGroupSize) {
- const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize,
- getLDSSize(F));
- }
-
unsigned getMaxWavesPerEU(const Function &F) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
return ST.getMaxWavesPerEU();
@@ -258,14 +240,6 @@ private:
return Status;
}
- /// Returns the minimum amount of LDS space used by a workgroup running
- /// function \p F.
- static unsigned getLDSSize(const Function &F) {
- return AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
- {0, UINT32_MAX}, true)
- .first;
- }
-
/// Get the constant access bitmap for \p C.
uint8_t getConstantAccess(const Constant *C,
SmallPtrSetImpl<const Constant *> &Visited) {
@@ -534,6 +508,21 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
ImplicitArgumentMask AttrMask =
intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
HasApertureRegs, SupportsGetDoorbellID, COV);
+
+ if (AttrMask == UNKNOWN_INTRINSIC) {
+ // Assume intrinsics without the nocallback attribute may invoke a
+ // function which accesses implicit arguments.
+ //
+ // FIXME: This isn't really the correct check. We want to ensure it
+ // isn't calling any function that may use implicit arguments regardless
+ // of whether it's internal to the module or not.
+ //
+ // TODO: Ignoring callsite attributes.
+ if (!Callee->hasFnAttribute(Attribute::NoCallback))
+ return indicatePessimisticFixpoint();
+ continue;
+ }
+
if (AttrMask != NOT_IMPLICIT_INPUT) {
if ((IsNonEntryFunc || !NonKernelOnly))
removeAssumedBits(AttrMask);
@@ -1336,7 +1325,6 @@ struct AAAMDGPUMinAGPRAlloc
Maximum.takeAssumedMaximum(NumRegs);
return true;
}
-
switch (CB.getIntrinsicID()) {
case Intrinsic::not_intrinsic:
break;
@@ -1354,10 +1342,24 @@ struct AAAMDGPUMinAGPRAlloc
return true;
}
+ // Trap-like intrinsics such as llvm.trap and llvm.debugtrap do not have
+ // the nocallback attribute, so the generic rule below would conservatively
+ // drop all implicitly-known inputs and AGPR allocation information for
+ // them. Make sure we still infer that no implicit inputs are required and
+ // that the AGPR allocation stays at zero. The exception is a configured
+ // trap handler: if the call site carries a "trap-func-name" attribute, the
+ // named function may be invoked and could itself require AGPRs.
+ case Intrinsic::trap:
+ case Intrinsic::debugtrap:
+ case Intrinsic::ubsantrap:
+ return CB.hasFnAttr(Attribute::NoCallback) ||
+ !CB.hasFnAttr("trap-func-name");
default:
// Some intrinsics may use AGPRs, but if we have a choice, we are not
// required to use AGPRs.
- return true;
+ // Assume !nocallback intrinsics may call a function which requires
+ // AGPRs.
+ return CB.hasFnAttr(Attribute::NoCallback);
}
// TODO: Handle callsite attributes
@@ -1555,7 +1557,7 @@ private:
AMDGPU::ClusterDimsAttr Attr;
- static constexpr const char AttrName[] = "amdgpu-cluster-dims";
+ static constexpr char AttrName[] = "amdgpu-cluster-dims";
};
AAAMDGPUClusterDims &
@@ -1584,7 +1586,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
&AAAMDGPUMinAGPRAlloc::ID, &AACallEdges::ID, &AAPointerInfo::ID,
&AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID,
&AANoAliasAddrSpace::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
- &AAAMDGPUClusterDims::ID});
+ &AAAMDGPUClusterDims::ID, &AAAlign::ID});
AttributorConfig AC(CGUpdater);
AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1642,6 +1644,10 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
if (Ptr) {
A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr));
A.getOrCreateAAFor<AANoAliasAddrSpace>(IRPosition::value(*Ptr));
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Ptr)) {
+ if (II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc)
+ A.getOrCreateAAFor<AAAlign>(IRPosition::value(*Ptr));
+ }
}
}
}
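
Note on the attributor hunks above: the new handling reduces to a small
decision rule per intrinsic call site. The sketch below mirrors it for the
AGPR case in plain C++ (hypothetical helper, not the Attributor code):

    // Whether a call site lets us keep assuming no AGPRs are required:
    // trap-like intrinsics are safe unless a trap handler function may be
    // invoked; other unknown intrinsics are safe only when nocallback.
    static bool keepsZeroAgprAssumption(bool IsTrapLike, bool HasNoCallback,
                                        bool HasTrapFuncName) {
      if (IsTrapLike)
        return HasNoCallback || !HasTrapFuncName;
      return HasNoCallback;
    }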
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
new file mode 100644
index 0000000..c9fcec8
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
@@ -0,0 +1,120 @@
+//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a DAG scheduling mutation to add latency to:
+/// 1. Barrier edges between ATOMIC_FENCE instructions and preceding
+/// memory accesses potentially affected by the fence.
+/// This encourages the scheduling of more instructions before
+/// ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may
+/// introduce wait counting or indicate an impending S_BARRIER
+/// wait. Having more instructions in-flight across these
+/// constructs improves latency hiding.
+/// 2. Barrier edges from S_BARRIER_SIGNAL to S_BARRIER_WAIT.
+/// This encourages independent work to be scheduled between
+/// signal and wait, hiding barrier synchronization latency.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUBarrierLatency.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned> BarrierSignalWaitLatencyOpt(
+ "amdgpu-barrier-signal-wait-latency",
+ cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT "
+ "to encourage scheduling independent work between them"),
+ cl::init(16), cl::Hidden);
+
+namespace {
+
+class BarrierLatency : public ScheduleDAGMutation {
+private:
+ SmallSet<SyncScope::ID, 4> IgnoredScopes;
+
+public:
+ BarrierLatency(MachineFunction *MF) {
+ LLVMContext &Context = MF->getFunction().getContext();
+ IgnoredScopes.insert(SyncScope::SingleThread);
+ IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront"));
+ IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as"));
+ IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as"));
+
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ if (!ST.requiresWaitOnWorkgroupReleaseFence()) {
+ // Prior to GFX10 workgroup scope does not normally require waitcnts
+ IgnoredScopes.insert(Context.getOrInsertSyncScopeID("workgroup"));
+ }
+ }
+ void apply(ScheduleDAGInstrs *DAG) override;
+};
+
+void addLatencyToEdge(SDep &PredDep, SUnit &SU, unsigned Latency) {
+ SUnit *PredSU = PredDep.getSUnit();
+ SDep ForwardD = PredDep;
+ ForwardD.setSUnit(&SU);
+ for (SDep &SuccDep : PredSU->Succs) {
+ if (SuccDep == ForwardD) {
+ SuccDep.setLatency(SuccDep.getLatency() + Latency);
+ break;
+ }
+ }
+ PredDep.setLatency(PredDep.getLatency() + Latency);
+ PredSU->setDepthDirty();
+ SU.setDepthDirty();
+}
+
+void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII);
+ constexpr unsigned FenceLatency = 2000;
+ const unsigned BarrierSignalWaitLatency = BarrierSignalWaitLatencyOpt;
+
+ for (SUnit &SU : DAG->SUnits) {
+ const MachineInstr *MI = SU.getInstr();
+ unsigned Op = MI->getOpcode();
+
+ if (Op == AMDGPU::ATOMIC_FENCE) {
+ // Update latency on barrier edges of ATOMIC_FENCE.
+ // Ignore scopes not expected to have any latency.
+ SyncScope::ID SSID =
+ static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
+ if (IgnoredScopes.contains(SSID))
+ continue;
+
+ for (SDep &PredDep : SU.Preds) {
+ if (!PredDep.isBarrier())
+ continue;
+ SUnit *PredSU = PredDep.getSUnit();
+ const MachineInstr *PredMI = PredSU->getInstr();
+ // Only consider memory loads.
+ if (!PredMI->mayLoad() || PredMI->mayStore())
+ continue;
+ addLatencyToEdge(PredDep, SU, FenceLatency);
+ }
+ } else if (Op == AMDGPU::S_BARRIER_WAIT) {
+ for (SDep &PredDep : SU.Preds) {
+ SUnit *PredSU = PredDep.getSUnit();
+ const MachineInstr *PredMI = PredSU->getInstr();
+ if (TII->isBarrierStart(PredMI->getOpcode())) {
+ addLatencyToEdge(PredDep, SU, BarrierSignalWaitLatency);
+ }
+ }
+ }
+ }
+}
+
+} // end namespace
+
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF) {
+ return std::make_unique<BarrierLatency>(MF);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
new file mode 100644
index 0000000..547cd2a
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
@@ -0,0 +1,24 @@
+//===- AMDGPUBarrierLatency.h - AMDGPU Barrier Latency ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
+
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include <memory>
+
+namespace llvm {
+
+class MachineFunction;
+
+std::unique_ptr<ScheduleDAGMutation>
+createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF);
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 7afadde..5c6affd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/PseudoSourceValueManager.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#define DEBUG_TYPE "amdgpu-call-lowering"
@@ -209,7 +210,7 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
if (!SPReg) {
const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>();
- if (ST.enableFlatScratch()) {
+ if (ST.hasFlatScratchEnabled()) {
// The stack is accessed unswizzled, so we can use a regular copy.
SPReg = MIRBuilder.buildCopy(PtrTy,
MFI->getStackPtrOffsetReg()).getReg(0);
@@ -414,12 +415,13 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg,
MachineFunction &MF = B.getMF();
const Function &F = MF.getFunction();
const DataLayout &DL = F.getDataLayout();
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+ MachinePointerInfo PtrInfo = TLI.getKernargSegmentPtrInfo(MF);
LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
SmallVector<ArgInfo, 32> SplitArgs;
- SmallVector<uint64_t> FieldOffsets;
+ SmallVector<TypeSize> FieldOffsets;
splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv(), &FieldOffsets);
unsigned Idx = 0;
@@ -737,7 +739,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
// For the fixed ABI, pass workitem IDs in the last argument register.
TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
- if (!Subtarget.enableFlatScratch())
+ if (!Subtarget.hasFlatScratchEnabled())
CCInfo.AllocateReg(Info->getScratchRSrcReg());
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
@@ -1196,7 +1198,7 @@ void AMDGPUCallLowering::handleImplicitCallArguments(
const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
CallingConv::ID CalleeCC,
ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
- if (!ST.enableFlatScratch()) {
+ if (!ST.hasFlatScratchEnabled()) {
// Insert copies for the SRD. In the HSA case, this should be an identity
// copy.
auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index e891fdb..2932bbf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -14,6 +14,10 @@
class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
class CCIfExtend<CCAction A>
: CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
+class CCIfOrigTypeShaderCCIsSGPR<CCAction A>
+ : CCIf<[{(!OrigTy->getScalarType()->isFloatTy() &&
+ !OrigTy->getScalarType()->isHalfTy()) }], A>;
+
// Calling convention for SI
def CC_SI_Gfx : CallingConv<[
@@ -56,14 +60,15 @@ def CC_SI_SHADER : CallingConv<[
>>>
]>;
+
def RetCC_SI_Shader : CallingConv<[
CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
- CCIfType<[i32, i16, v2i16] , CCAssignToReg<
+ CCIfType<[i32, i16, v2i16], CCIfOrigTypeShaderCCIsSGPR<CCAssignToReg<
!foreach(i, !range(0, 44), !cast<Register>("SGPR"#i)) // SGPR0-43
- >>,
+ >>>,
// 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
- CCIfType<[f32, f16, v2f16, bf16, v2bf16] , CCAssignToReg<
+ CCIfType<[f32, f16, v2f16, bf16, v2bf16, i32, i16, v2i16], CCAssignToReg<
!foreach(i, !range(0, 136), !cast<Register>("VGPR"#i)) // VGPR0-135
>>
]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8e35ba7..e51d2c0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -33,6 +33,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/KnownFPClass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -100,10 +101,9 @@ public:
const GCNSubtarget &ST;
const AMDGPUTargetMachine &TM;
const TargetLibraryInfo *TLI;
- AssumptionCache *AC;
- const DominatorTree *DT;
const UniformityInfo &UA;
const DataLayout &DL;
+ SimplifyQuery SQ;
const bool HasFP32DenormalFlush;
bool FlowChanged = false;
mutable Function *SqrtF32 = nullptr;
@@ -115,8 +115,8 @@ public:
AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
const TargetLibraryInfo *TLI, AssumptionCache *AC,
const DominatorTree *DT, const UniformityInfo &UA)
- : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
- DT(DT), UA(UA), DL(F.getDataLayout()),
+ : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), UA(UA),
+ DL(F.getDataLayout()), SQ(DL, TLI, DT, AC),
HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
DenormalMode::getPreserveSign()) {}
@@ -143,21 +143,14 @@ public:
bool canBreakPHINode(const PHINode &I);
- /// \returns True if binary operation \p I is a signed binary operation, false
- /// otherwise.
- bool isSigned(const BinaryOperator &I) const;
-
- /// \returns True if the condition of 'select' operation \p I comes from a
- /// signed 'icmp' operation, false otherwise.
- bool isSigned(const SelectInst &I) const;
-
/// Return true if \p T is a legal scalar floating point type.
bool isLegalFloatingTy(const Type *T) const;
/// Wrapper to pass all the arguments to computeKnownFPClass
KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
const Instruction *CtxI) const {
- return llvm::computeKnownFPClass(V, DL, Interested, TLI, AC, CtxI, DT);
+ return llvm::computeKnownFPClass(V, Interested,
+ SQ.getWithInstruction(CtxI));
}
bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
@@ -168,12 +161,12 @@ public:
/// \returns The minimum number of bits needed to store the value of \Op as an
/// unsigned integer. Truncating to this size and then zero-extending to
/// the original will not change the value.
- unsigned numBitsUnsigned(Value *Op) const;
+ unsigned numBitsUnsigned(Value *Op, const Instruction *CtxI) const;
/// \returns The minimum number of bits needed to store the value of \Op as a
/// signed integer. Truncating to this size and then sign-extending to
/// the original size will not change the value.
- unsigned numBitsSigned(Value *Op) const;
+ unsigned numBitsSigned(Value *Op, const Instruction *CtxI) const;
/// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
/// SelectionDAG has an issue where an and asserting the bits are known
@@ -218,8 +211,7 @@ public:
Value *matchFractPat(IntrinsicInst &I);
Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
- bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
- FastMathFlags SqrtFMF) const;
+ bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const;
Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
FastMathFlags DivFMF, FastMathFlags SqrtFMF,
@@ -244,6 +236,14 @@ public:
FastMathFlags FMF) const;
Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
FastMathFlags FMF) const;
+ Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
+ FastMathFlags DivFMF, const Instruction *CtxI,
+ bool IsNegative) const;
+
+ CallInst *createWorkitemIdX(IRBuilder<> &B) const;
+ void replaceWithWorkitemIdX(Instruction &I) const;
+ void replaceWithMaskedWorkitemIdX(Instruction &I, unsigned WaveSize) const;
+ bool tryReplaceWithWorkitemId(Instruction &I, unsigned Wave) const;
bool tryNarrowMathIfNoOverflow(Instruction *I);
@@ -260,6 +260,8 @@ public:
bool visitIntrinsicInst(IntrinsicInst &I);
bool visitFMinLike(IntrinsicInst &I);
bool visitSqrt(IntrinsicInst &I);
+ bool visitMbcntLo(IntrinsicInst &I) const;
+ bool visitMbcntHi(IntrinsicInst &I) const;
bool run();
};
@@ -304,16 +306,6 @@ bool AMDGPUCodeGenPrepareImpl::run() {
return MadeChange;
}
-bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
- return I.getOpcode() == Instruction::AShr ||
- I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
-}
-
-bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
- return isa<ICmpInst>(I.getOperand(0)) &&
- cast<ICmpInst>(I.getOperand(0))->isSigned();
-}
-
bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
return Ty->isFloatTy() || Ty->isDoubleTy() ||
(Ty->isHalfTy() && ST.has16BitInsts());
@@ -327,12 +319,16 @@ bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
}
-unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
- return computeKnownBits(Op, DL, AC).countMaxActiveBits();
+unsigned
+AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op,
+ const Instruction *CtxI) const {
+ return computeKnownBits(Op, SQ.getWithInstruction(CtxI)).countMaxActiveBits();
}
-unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
- return ComputeMaxSignificantBits(Op, DL, AC);
+unsigned
+AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op,
+ const Instruction *CtxI) const {
+ return ComputeMaxSignificantBits(Op, SQ.DL, SQ.AC, CtxI, SQ.DT);
}
static void extractValues(IRBuilder<> &Builder,
@@ -383,12 +379,12 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
unsigned LHSBits = 0, RHSBits = 0;
bool IsSigned = false;
- if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
- (RHSBits = numBitsUnsigned(RHS)) <= 24) {
+ if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS, &I)) <= 24 &&
+ (RHSBits = numBitsUnsigned(RHS, &I)) <= 24) {
IsSigned = false;
- } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
- (RHSBits = numBitsSigned(RHS)) <= 24) {
+ } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS, &I)) <= 24 &&
+ (RHSBits = numBitsSigned(RHS, &I)) <= 24) {
IsSigned = true;
} else
@@ -623,15 +619,101 @@ static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
return Builder.CreateFMul(Rsq, OutputScaleFactor);
}
-bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
- FastMathFlags DivFMF,
- FastMathFlags SqrtFMF) const {
- // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
- if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
- return false;
+/// Emit inverse sqrt expansion for f64 with a correction sequence on top of
+/// v_rsq_f64. This should give a 1ulp result.
+Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
+ FastMathFlags SqrtFMF,
+ FastMathFlags DivFMF,
+ const Instruction *CtxI,
+ bool IsNegative) const {
+ // rsq(x):
+ // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
+ // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
+ // return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0);
+ //
+ // -rsq(x):
+ // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
+ // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
+ // return MATH_MAD(-y0*e, MATH_MAD(e, 0.375, 0.5), -y0);
+ //
+ // The rsq instruction handles the special cases correctly. We need to check
+ // for the edge case conditions to ensure the special case propagates through
+ // the later instructions.
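+ //
+ // In exact arithmetic, with y0 ~= 1/sqrt(x), the error term computed
+ // below is e = 1 - x*y0*y0, and the final result y0 + y0*e*(0.5 + 0.375*e)
+ // is a second-order polynomial refinement of 1/sqrt(x), which is what
+ // brings the estimate to ~1ulp.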
+
+ Value *Y0 = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, X);
+
+ // Try to elide the edge case check.
+ //
+ // Fast math flags imply:
+ // sqrt ninf => !isinf(x)
+ // fdiv ninf => x != 0, !isinf(x)
+ bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
+ bool MaybeZero = !DivFMF.noInfs();
+
+ DenormalMode DenormMode;
+ FPClassTest Interested = fcNone;
+ if (MaybePosInf)
+ Interested = fcPosInf;
+ if (MaybeZero)
+ Interested |= fcZero;
+
+ if (Interested != fcNone) {
+ KnownFPClass KnownSrc = computeKnownFPClass(X, Interested, CtxI);
+ if (KnownSrc.isKnownNeverPosInfinity())
+ MaybePosInf = false;
+
+ DenormMode = F.getDenormalMode(X->getType()->getFltSemantics());
+ if (KnownSrc.isKnownNeverLogicalZero(DenormMode))
+ MaybeZero = false;
+ }
+
+ Value *SpecialOrRsq = X;
+ if (MaybeZero || MaybePosInf) {
+ Value *Cond;
+ if (MaybePosInf && MaybeZero) {
+ if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
+ FPClassTest TestMask = fcPosInf | fcZero;
+ if (DenormMode.inputsAreZero())
+ TestMask |= fcSubnormal;
- // v_rsq_f32 gives 1ulp
- return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f;
+ Cond = Builder.createIsFPClass(X, TestMask);
+ } else {
+ // Avoid using llvm.is.fpclass for dynamic denormal mode, since it
+ // doesn't respect the floating-point environment.
+ Value *IsZero =
+ Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
+ Value *IsInf =
+ Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
+ Cond = Builder.CreateOr(IsZero, IsInf);
+ }
+ } else if (MaybeZero) {
+ Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
+ } else {
+ Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
+ }
+
+ SpecialOrRsq = Builder.CreateSelect(Cond, Y0, X);
+ }
+
+ Value *NegY0 = Builder.CreateFNeg(Y0);
+ Value *NegXY0 = Builder.CreateFMul(SpecialOrRsq, NegY0);
+
+ // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64.
+ Value *E = Builder.CreateFMA(NegXY0, Y0, ConstantFP::get(X->getType(), 1.0));
+
+ Value *Y0E = Builder.CreateFMul(E, IsNegative ? NegY0 : Y0);
+
+ Value *EFMA = Builder.CreateFMA(E, ConstantFP::get(X->getType(), 0.375),
+ ConstantFP::get(X->getType(), 0.5));
+
+ return Builder.CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
+}
+
+bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
+ FastMathFlags SqrtFMF) const {
+ // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
+ // f64.
+ return DivFMF.allowContract() && SqrtFMF.allowContract();
}
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
@@ -647,8 +729,6 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
if (!CLHS)
return nullptr;
- assert(Den->getType()->isFloatTy());
-
bool IsNegative = false;
// TODO: Handle other numerator values with arcp.
@@ -657,14 +737,20 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
IRBuilder<>::FastMathFlagGuard Guard(Builder);
Builder.setFastMathFlags(DivFMF | SqrtFMF);
- if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
- canIgnoreDenormalInput(Den, CtxI)) {
- Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
- // -1.0 / sqrt(x) -> fneg(rsq(x))
- return IsNegative ? Builder.CreateFNeg(Result) : Result;
+ if (Den->getType()->isFloatTy()) {
+ if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
+ canIgnoreDenormalInput(Den, CtxI)) {
+ Value *Result =
+ Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
+ // -1.0 / sqrt(x) -> fneg(rsq(x))
+ return IsNegative ? Builder.CreateFNeg(Result) : Result;
+ }
+
+ return emitRsqIEEE1ULP(Builder, Den, IsNegative);
}
- return emitRsqIEEE1ULP(Builder, Den, IsNegative);
+ if (Den->getType()->isDoubleTy())
+ return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
}
return nullptr;
@@ -776,6 +862,9 @@ Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
return Rsq;
}
+ if (!Num->getType()->isFloatTy())
+ return nullptr;
+
Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
if (Rcp)
return Rcp;
@@ -811,7 +900,8 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
return false;
Type *Ty = FDiv.getType()->getScalarType();
- if (!Ty->isFloatTy())
+ const bool IsFloat = Ty->isFloatTy();
+ if (!IsFloat && !Ty->isDoubleTy())
return false;
// The f64 rcp/rsq approximations are pretty inaccurate. We can do an
@@ -832,10 +922,14 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
DenII->hasOneUse()) {
const auto *SqrtOp = cast<FPMathOperator>(DenII);
SqrtFMF = SqrtOp->getFastMathFlags();
- if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
+ if (canOptimizeWithRsq(DivFMF, SqrtFMF))
RsqOp = SqrtOp->getOperand(0);
}
+ // rcp path not yet implemented for f64.
+ if (!IsFloat && !RsqOp)
+ return false;
+
// Inaccurate rcp is allowed with afn.
//
// Defer to codegen to handle this.
@@ -850,7 +944,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
return false;
// Defer the correct implementations to codegen.
- if (ReqdAccuracy < 1.0f)
+ if (IsFloat && ReqdAccuracy < 1.0f)
return false;
IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
@@ -929,13 +1023,13 @@ unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
Den->getType()->getScalarSizeInBits());
unsigned SSBits = Num->getType()->getScalarSizeInBits();
if (IsSigned) {
- unsigned RHSSignBits = ComputeNumSignBits(Den, DL, AC, &I);
+ unsigned RHSSignBits = ComputeNumSignBits(Den, SQ.DL, SQ.AC, &I, SQ.DT);
// A sign bit needs to be reserved for shrinking.
unsigned DivBits = SSBits - RHSSignBits + 1;
if (DivBits > MaxDivBits)
return SSBits;
- unsigned LHSSignBits = ComputeNumSignBits(Num, DL, AC, &I);
+ unsigned LHSSignBits = ComputeNumSignBits(Num, SQ.DL, SQ.AC, &I, SQ.DT);
unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
DivBits = SSBits - SignBits + 1;
@@ -944,7 +1038,7 @@ unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
// All bits are used for unsigned division for Num or Den in range
// (SignedMax, UnsignedMax].
- KnownBits Known = computeKnownBits(Den, DL, AC, &I);
+ KnownBits Known = computeKnownBits(Den, SQ.getWithInstruction(&I));
if (Known.isNegative() || !Known.isNonNegative())
return SSBits;
unsigned RHSSignBits = Known.countMinLeadingZeros();
@@ -952,7 +1046,7 @@ unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
if (DivBits > MaxDivBits)
return SSBits;
- Known = computeKnownBits(Num, DL, AC, &I);
+ Known = computeKnownBits(Num, SQ.getWithInstruction(&I));
if (Known.isNegative() || !Known.isNonNegative())
return SSBits;
unsigned LHSSignBits = Known.countMinLeadingZeros();
@@ -1089,7 +1183,7 @@ bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
// If there's no wider mulhi, there's only a better expansion for powers of
// two.
// TODO: Should really know for each vector element.
- if (isKnownToBeAPowerOfTwo(C, DL, true, AC, &I, DT))
+ if (isKnownToBeAPowerOfTwo(C, true, SQ.getWithInstruction(&I)))
return true;
return false;
@@ -1099,7 +1193,8 @@ bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
// fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
if (BinOpDen->getOpcode() == Instruction::Shl &&
isa<Constant>(BinOpDen->getOperand(0)) &&
- isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), DL, true, AC, &I, DT)) {
+ isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), true,
+ SQ.getWithInstruction(&I))) {
return true;
}
}
@@ -1910,6 +2005,10 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
return visitFMinLike(I);
case Intrinsic::sqrt:
return visitSqrt(I);
+ case Intrinsic::amdgcn_mbcnt_lo:
+ return visitMbcntLo(I);
+ case Intrinsic::amdgcn_mbcnt_hi:
+ return visitMbcntHi(I);
default:
return false;
}
@@ -1984,7 +2083,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
// Match pattern for fract intrinsic in contexts where the nan check has been
// optimized out (and hope the knowledge the source can't be nan wasn't lost).
- if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SimplifyQuery(DL, TLI)))
+ if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SQ.getWithInstruction(&I)))
return false;
IRBuilder<> Builder(&I);
@@ -2090,6 +2189,110 @@ INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
false, false)
+/// Create a workitem.id.x intrinsic call with range metadata.
+CallInst *AMDGPUCodeGenPrepareImpl::createWorkitemIdX(IRBuilder<> &B) const {
+ CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
+ ST.makeLIDRangeMetadata(Tid);
+ return Tid;
+}
+
+/// Replace the instruction with a direct workitem.id.x call.
+void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &I) const {
+ IRBuilder<> B(&I);
+ CallInst *Tid = createWorkitemIdX(B);
+ BasicBlock::iterator BI(&I);
+ ReplaceInstWithValue(BI, Tid);
+}
+
+/// Replace the instruction with (workitem.id.x & mask).
+void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
+ Instruction &I, unsigned WaveSize) const {
+ IRBuilder<> B(&I);
+ CallInst *Tid = createWorkitemIdX(B);
+ Constant *Mask = ConstantInt::get(Tid->getType(), WaveSize - 1);
+ Value *AndInst = B.CreateAnd(Tid, Mask);
+ BasicBlock::iterator BI(&I);
+ ReplaceInstWithValue(BI, AndInst);
+}
+
+/// Try to optimize mbcnt instruction by replacing with workitem.id.x when
+/// work group size allows direct computation of lane ID.
+/// Returns true if optimization was applied, false otherwise.
+bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &I,
+ unsigned Wave) const {
+ std::optional<unsigned> MaybeX = ST.getReqdWorkGroupSize(F, 0);
+ if (!MaybeX)
+ return false;
+
+ // When work group size == wave_size, each work group contains exactly one
+ // wave, so the instruction can be replaced with workitem.id.x directly.
+ if (*MaybeX == Wave) {
+ replaceWithWorkitemIdX(I);
+ return true;
+ }
+
+ // When work group evenly splits into waves, compute lane ID within wave
+ // using bit masking: lane_id = workitem.id.x & (wave_size - 1).
+ if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true)) {
+ replaceWithMaskedWorkitemIdX(I, Wave);
+ return true;
+ }
+
+ return false;
+}
+
+/// Optimize mbcnt.lo calls on wave32 architectures for lane ID computation.
+bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const {
+ // This optimization only applies to wave32 targets where mbcnt.lo operates on
+ // the full execution mask.
+ if (!ST.isWave32())
+ return false;
+
+ // Only optimize the pattern mbcnt.lo(~0, 0) which counts active lanes with
+ // lower IDs.
+ if (!match(&I,
+ m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(m_AllOnes(), m_Zero())))
+ return false;
+
+ return tryReplaceWithWorkitemId(I, ST.getWavefrontSize());
+}
+
+/// Optimize mbcnt.hi calls for lane ID computation.
+bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const {
+ // Abort if wave size is not known at compile time.
+ if (!ST.isWaveSizeKnown())
+ return false;
+
+ unsigned Wave = ST.getWavefrontSize();
+
+ // On wave32, the upper 32 bits of execution mask are always 0, so
+ // mbcnt.hi(mask, val) always returns val unchanged.
+ if (ST.isWave32()) {
+ if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) {
+ // Replace mbcnt.hi(mask, val) with val only when work group size matches
+ // wave size (single wave per work group).
+ if (*MaybeX == Wave) {
+ BasicBlock::iterator BI(&I);
+ ReplaceInstWithValue(BI, I.getArgOperand(1));
+ return true;
+ }
+ }
+ }
+
+ // Optimize the complete lane ID computation pattern:
+ // mbcnt.hi(~0, mbcnt.lo(~0, 0)) which counts all active lanes with lower IDs
+ // across the full execution mask.
+ using namespace PatternMatch;
+
+ // Check for pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0))
+ if (!match(&I, m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
+ m_AllOnes(), m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
+ m_AllOnes(), m_Zero()))))
+ return false;
+
+ return tryReplaceWithWorkitemId(I, Wave);
+}
+
char AMDGPUCodeGenPrepare::ID = 0;
FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
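
Note on the mbcnt rewrites above: they hinge on one piece of arithmetic:
when the workgroup X dimension splits evenly into waves, the lane id counted
by mbcnt(~0, 0) equals the low bits of workitem.id.x. A standalone check
(plain C++, illustrative only):

    #include <cassert>

    // Lane id within a wave from the flat X workitem id; valid when the
    // workgroup X size is a multiple of the wave size (32 or 64).
    static unsigned laneIdFromWorkitemX(unsigned TidX, unsigned WaveSize) {
      return TidX & (WaveSize - 1);
    }

    int main() {
      assert(laneIdFromWorkitemX(0, 32) == 0);
      assert(laneIdFromWorkitemX(37, 32) == 5);  // second wave, lane 5
      assert(laneIdFromWorkitemX(63, 64) == 63); // single wave: identity
      return 0;
    }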
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index e8b211f..7f00ead 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -176,6 +176,19 @@ def binop_s64_with_s32_mask_combines : GICombineGroup<[
combine_or_s64_with_s32_mask, combine_and_s64_with_s32_mask
]>;
+// (or i64:x, (zext i32:y)) -> i64:(merge (or lo_32(x), i32:y), hi_32(x))
+// (or (zext i32:y), i64:x) -> i64:(merge (or lo_32(x), i32:y), hi_32(x))
+def or_s64_zext_s32_frag : GICombinePatFrag<(outs root:$dst), (ins $src_s64, $src_s32),
+ [(pattern (G_OR $dst, i64:$src_s64, i64:$zext_val), (G_ZEXT i64:$zext_val, i32:$src_s32)),
+ (pattern (G_OR $dst, i64:$zext_val, i64:$src_s64), (G_ZEXT i64:$zext_val, i32:$src_s32))]>;
+
+def combine_or_s64_s32 : GICombineRule<
+ (defs root:$dst),
+ (match (or_s64_zext_s32_frag $dst, i64:$x, i32:$y):$dst),
+ (apply (G_UNMERGE_VALUES $x_lo, $x_hi, $x),
+ (G_OR $or, $x_lo, $y),
+ (G_MERGE_VALUES $dst, $or, $x_hi))>;
+
let Predicates = [Has16BitInsts, NotHasMed3_16] in {
// For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
// saves one instruction compared to the promotion.
@@ -206,7 +219,7 @@ def AMDGPUPreLegalizerCombiner: GICombiner<
"AMDGPUPreLegalizerCombinerImpl",
[all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16,
foldable_fneg, combine_shuffle_vector_to_build_vector,
- binop_s64_with_s32_mask_combines]> {
+ binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
@@ -215,7 +228,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
[all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64,
- binop_s64_with_s32_mask_combines]> {
+ binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
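
Note on combine_or_s64_s32: it rests on the identity that OR-ing with a
zero-extended 32-bit value only touches the low half of a 64-bit value. A
quick C++ sanity check of that identity:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t X = 0x123456789abcdef0ULL;
      uint32_t Y = 0x0f0f0f0fu;
      // merge(or(lo_32(x), y), hi_32(x))
      uint64_t Merged =
          (X & 0xffffffff00000000ULL) | uint64_t(uint32_t(X) | Y);
      assert(Merged == (X | uint64_t(Y)));
      return 0;
    }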
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
index d23521c..77be58c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -143,8 +143,7 @@ static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
}
static bool mayIgnoreSignedZero(MachineInstr &MI) {
- const TargetOptions &Options = MI.getMF()->getTarget().Options;
- return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
+ return MI.getFlag(MachineInstr::MIFlag::FmNsz);
}
static bool isInv2Pi(const APFloat &APF) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
index d14b5ce..f538769 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -7,13 +7,13 @@
//===----------------------------------------------------------------------===//
def FeatureFP64 : SubtargetFeature<"fp64",
- "FP64",
+ "HasFP64",
"true",
"Enable double precision operations"
>;
def FeatureFMA : SubtargetFeature<"fmaf",
- "FMA",
+ "HasFMA",
"true",
"Enable single precision FMA (not as fast as mul+add, but fused)"
>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index bb4bf74..cfef046 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -288,6 +288,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
// FIXME: Check MMO is atomic
def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
+def : GINodeEquiv<G_ATOMICRMW_USUB_COND, atomic_load_usub_cond_glue>;
+def : GINodeEquiv<G_ATOMICRMW_USUB_SAT, atomic_load_usub_sat_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>;
@@ -308,6 +310,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32, SIbuffer_atomic_cond_sub_u32>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32, SIbuffer_atomic_csub>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SBYTE, SIsbuffer_load_byte>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>;
@@ -326,6 +329,12 @@ def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>;
// G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return,
// so we don't mark it as equivalent.
+def : GINodeEquiv<G_AMDGPU_SPONENTRY, sponentry>;
+
+def : GINodeEquiv<G_AMDGPU_FLAT_LOAD_MONITOR, AMDGPUflat_load_monitor>;
+def : GINodeEquiv<G_AMDGPU_GLOBAL_LOAD_MONITOR, AMDGPUglobal_load_monitor>;
+
class GISelSop2Pat <
SDPatternOperator node,
Instruction inst,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 1b4b113..6bad4dbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -131,7 +131,7 @@ protected:
public:
MetadataStreamerMsgPackV4() = default;
- ~MetadataStreamerMsgPackV4() = default;
+ ~MetadataStreamerMsgPackV4() override = default;
bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
@@ -154,7 +154,7 @@ protected:
public:
MetadataStreamerMsgPackV5() = default;
- ~MetadataStreamerMsgPackV5() = default;
+ ~MetadataStreamerMsgPackV5() override = default;
};
class MetadataStreamerMsgPackV6 final : public MetadataStreamerMsgPackV5 {
@@ -163,7 +163,7 @@ protected:
public:
MetadataStreamerMsgPackV6() = default;
- ~MetadataStreamerMsgPackV6() = default;
+ ~MetadataStreamerMsgPackV6() override = default;
void emitKernelAttrs(const AMDGPUTargetMachine &TM, const MachineFunction &MF,
msgpack::MapDocNode Kern) override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp
new file mode 100644
index 0000000..37f8678
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp
@@ -0,0 +1,77 @@
+//===--- AMDGPUHazardLatency.cpp - AMDGPU Hazard Latency Adjustment -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a DAG scheduling mutation to adjust the
+/// latency of data edges between instructions that use registers
+/// potentially subject to additional hazard waits not accounted
+/// for in the normal scheduling model.
+/// While the scheduling model is typically still accurate in these
+/// scenarios, adjusting the latency of the relevant edges can improve
+/// wait merging and reduce the pipeline impact of any required waits.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUHazardLatency.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+using namespace llvm;
+
+namespace {
+
+class HazardLatency : public ScheduleDAGMutation {
+private:
+ const GCNSubtarget &ST;
+ const SIRegisterInfo &TRI;
+ const MachineRegisterInfo &MRI;
+
+public:
+ HazardLatency(MachineFunction *MF)
+ : ST(MF->getSubtarget<GCNSubtarget>()), TRI(*ST.getRegisterInfo()),
+ MRI(MF->getRegInfo()) {}
+ void apply(ScheduleDAGInstrs *DAG) override;
+};
+
+void HazardLatency::apply(ScheduleDAGInstrs *DAG) {
+ constexpr unsigned MaskLatencyBoost = 3;
+
+  // The hazard only manifests in Wave64.
+ if (!ST.hasVALUMaskWriteHazard() || !ST.isWave64())
+ return;
+
+ for (SUnit &SU : DAG->SUnits) {
+ const MachineInstr *MI = SU.getInstr();
+ if (!SIInstrInfo::isVALU(*MI))
+ continue;
+ if (MI->getOpcode() == AMDGPU::V_READLANE_B32 ||
+ MI->getOpcode() == AMDGPU::V_READFIRSTLANE_B32)
+ continue;
+ for (SDep &SuccDep : SU.Succs) {
+ if (SuccDep.isCtrl())
+ continue;
+      // Boost the latency of VALU writes to SGPRs that are read by VALUs, to
+      // reduce the risk of a premature VALU pipeline stall on those reads.
+ MachineInstr *DestMI = SuccDep.getSUnit()->getInstr();
+ if (!SIInstrInfo::isVALU(*DestMI))
+ continue;
+ Register Reg = SuccDep.getReg();
+ if (!TRI.isSGPRReg(MRI, Reg))
+ continue;
+ SuccDep.setLatency(SuccDep.getLatency() * MaskLatencyBoost);
+ }
+ }
+}
+
+} // end anonymous namespace
+
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createAMDGPUHazardLatencyDAGMutation(MachineFunction *MF) {
+ return std::make_unique<HazardLatency>(MF);
+}
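A mutation like this only takes effect once it is added to a scheduler's mutation list. A minimal sketch of the hook-up, assuming a MachineSchedContext *C (the actual registration site is elsewhere in the target, not in this hunk):

// Hypothetical registration inside a createMachineScheduler-style hook.
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));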
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h
new file mode 100644
index 0000000..134cc27
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h
@@ -0,0 +1,24 @@
+//===- AMDGPUHazardLatency.h - Hazard Latency Adjustment --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUHAZARDLATENCY_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUHAZARDLATENCY_H
+
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include <memory>
+
+namespace llvm {
+
+class MachineFunction;
+
+std::unique_ptr<ScheduleDAGMutation>
+createAMDGPUHazardLatencyDAGMutation(MachineFunction *MF);
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUHAZARDLATENCY_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 5700468..10ffbe2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -24,6 +24,8 @@
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"
+#include <type_traits>
+
using namespace llvm;
#define DEBUG_TYPE "igrouplp"
@@ -1044,7 +1046,7 @@ private:
if (!SyncPipe.size())
return false;
- auto SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) {
+ unsigned SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) {
return Succ.getKind() == SDep::Data;
});
if (SuccSize >= Size)
@@ -1052,7 +1054,7 @@ private:
if (HasIntermediary) {
for (auto Succ : SU->Succs) {
- auto SuccSize =
+ unsigned SuccSize =
llvm::count_if(Succ.getSUnit()->Succs, [](const SDep &SuccSucc) {
return SuccSucc.getKind() == SDep::Data;
});
@@ -1084,7 +1086,7 @@ private:
if (!SyncPipe.size())
return false;
- auto SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) {
+ unsigned SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) {
return Succ.getKind() == SDep::Data;
});
if (SuccSize >= Size)
@@ -1092,7 +1094,7 @@ private:
if (HasIntermediary) {
for (auto Succ : SU->Succs) {
- auto SuccSize =
+ unsigned SuccSize =
llvm::count_if(Succ.getSUnit()->Succs, [](const SDep &SuccSucc) {
return SuccSucc.getKind() == SDep::Data;
});
@@ -1968,7 +1970,7 @@ private:
int NumBits = 0;
auto TRI = TII->getRegisterInfo();
- auto &MRI = MI->getParent()->getParent()->getRegInfo();
+ auto &MRI = MI->getMF()->getRegInfo();
for (auto &Elt : Collection) {
auto Op = Elt->getInstr()->getOperand(0);
auto Size =
@@ -2183,7 +2185,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
// Interleave MFMA with DS_READ prefetch
- for (unsigned I = 0; I < DSRCount - 4; ++I) {
+ for (unsigned I = 4; I < DSRCount; ++I) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
@@ -2196,7 +2198,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
// Phase 2a: Loop carried dependency with V_PERM
// Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
// depend on. Interleave MFMA to keep XDL unit busy throughout.
- for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
+ for (unsigned I = DSWWithSharedVMEMCount; I < DSWWithPermCount; ++I) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
@@ -2233,7 +2235,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
// Phase 2b: Loop carried dependency without V_PERM
// Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
// Interleave MFMA to keep XDL unit busy throughout.
- for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
+ for (unsigned I = DSWWithPermCount; I < DSWCount; I++) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
@@ -2391,6 +2393,61 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
if (MI.isMetaInstruction())
Result = false;
+ else if (MI.isInlineAsm()) {
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+    auto &MRI = MI.getMF()->getRegInfo();
+ bool SGPR_used = false, SGPR_big_def = false, VGPR_used = false,
+ VMFMA_used = false, VReg32_used = false, MayLoad = MI.mayLoad(),
+ MayStore = MI.mayStore();
+ for (const MachineOperand &Operand : MI.operands())
+ if (Operand.isReg()) {
+ const TargetRegisterClass &RegClass =
+ *TRI.getRegClassForOperandReg(MRI, Operand);
+ if (TRI.hasVGPRs(&RegClass)) {
+ VGPR_used = true;
+ if (Operand.isUse() && TRI.getRegSizeInBits(RegClass) == 32)
+ VReg32_used = true;
+ }
+        // Registers wider than 128 bits are usually only used by MFMA
+        // instructions, so use that as a heuristic to guess the schedule
+        // group mask of the inline asm.
+ if (TRI.hasAGPRs(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128)
+ VMFMA_used = true;
+ if (TRI.hasSGPRs(&RegClass))
+ SGPR_used = true;
+ if (TRI.getRegSizeInBits(RegClass) > 64 && Operand.isDef())
+ SGPR_big_def = true;
+ }
+
+ typedef std::underlying_type_t<SchedGroupMask> SGMask_t;
+ SGMask_t InlineAsmMask = 0;
+ if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
+ InlineAsmMask |= (SGMask_t)SchedGroupMask::VALU;
+ if (SGPR_used && !VGPR_used && !MayLoad && !MayStore)
+ InlineAsmMask |= (SGMask_t)SchedGroupMask::SALU;
+ if (VMFMA_used)
+ InlineAsmMask |= (SGMask_t)SchedGroupMask::MFMA;
+ if (VGPR_used && MayLoad)
+ InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_READ
+ : SchedGroupMask::VMEM_READ);
+ if (VGPR_used && MayStore)
+ InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_WRITE
+ : SchedGroupMask::VMEM_WRITE);
+ if (SGPR_big_def)
+ InlineAsmMask |= (SGMask_t)SchedGroupMask::DS_READ;
+ if (InlineAsmMask & (SGMask_t)SchedGroupMask::VALU ||
+ InlineAsmMask & (SGMask_t)SchedGroupMask::SALU)
+ InlineAsmMask |= (SGMask_t)SchedGroupMask::ALU;
+ if (InlineAsmMask & (SGMask_t)SchedGroupMask::DS_READ ||
+ InlineAsmMask & (SGMask_t)SchedGroupMask::DS_WRITE)
+ InlineAsmMask |= (SGMask_t)SchedGroupMask::DS;
+ if (InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_READ ||
+ InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_WRITE)
+ InlineAsmMask |= (SGMask_t)SchedGroupMask::VMEM;
+
+ Result = ((SGMask_t)SGMask & InlineAsmMask) != 0;
+ }
+
else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
(TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
TII->isTRANS(MI)))
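Two hypothetical inline-asm examples, to illustrate how the heuristic above classifies statements (assuming the first asm's memory effects mark it as may-load only):

// asm volatile("ds_read_b32 %0, %1" : "=v"(V) : "v"(Addr));
//   32-bit VGPRs only, MayLoad, VReg32_used -> DS_READ, widened to DS.
// asm volatile("v_mfma_f32_32x32x1f32 ..." : "=a"(Acc) : "v"(A), "v"(B));
//   AGPR operand -> VMFMA_used -> MFMA.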
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index aff7096..0688f07 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -11,7 +11,6 @@
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include <memory>
-#include <vector>
namespace llvm {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b8b419d..238f06f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -134,7 +134,7 @@ static SDValue stripExtractLoElt(SDValue In) {
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false,
false)
-INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfoWrapperLegacy)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
#ifdef EXPENSIVE_CHECKS
@@ -238,7 +238,7 @@ bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
}
void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<AMDGPUArgumentUsageInfo>();
+ AU.addRequired<AMDGPUArgumentUsageInfoWrapperLegacy>();
AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
AU.addRequired<DominatorTreeWrapperPass>();
@@ -470,6 +470,24 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}
+SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N,
+ SelectionDAG &DAG) const {
+ // TODO: Handle undef as zero
+
+ assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
+ uint32_t LHSVal, RHSVal;
+ if (getConstantValue(N->getOperand(0), LHSVal) &&
+ getConstantValue(N->getOperand(1), RHSVal)) {
+ SDLoc SL(N);
+ uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
+ return DAG.getMachineNode(
+ isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL,
+ N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32));
+ }
+
+ return nullptr;
+}
+
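A quick worked example of the packing, as a standalone sketch (not part of the patch):

#include <cassert>
#include <cstdint>

// Mirrors the K computation above for two 16-bit constants.
uint32_t packV2I16(uint32_t LHSVal, uint32_t RHSVal) {
  return (LHSVal & 0xffff) | (RHSVal << 16);
}

int main() { assert(packV2I16(0x1234, 0xabcd) == 0xabcd1234u); }

The packed constant is then emitted as a single S_MOV_B32, or as V_MOV_B32_e32 when isVGPRImm indicates the use requires a VGPR.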
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
@@ -708,10 +726,14 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
break;
}
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
assert(VT.getVectorElementType().bitsEq(MVT::i32));
- unsigned RegClassID =
- SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
- SelectBuildVector(N, RegClassID);
+ const TargetRegisterClass *RegClass =
+ N->isDivergent()
+ ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32)
+ : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32);
+
+ SelectBuildVector(N, RegClass->getID());
return;
}
case ISD::VECTOR_SHUFFLE:
@@ -1284,7 +1306,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
// FIXME: Select to VOP3 version for with-carry.
unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
- if (Subtarget->hasAddNoCarry()) {
+ if (Subtarget->hasAddNoCarryInsts()) {
SubOp = AMDGPU::V_SUB_U32_e64;
Opnds.push_back(
CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
@@ -1469,7 +1491,7 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
Opnds.push_back(Zero);
Opnds.push_back(Addr.getOperand(1));
unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
- if (Subtarget->hasAddNoCarry()) {
+ if (Subtarget->hasAddNoCarryInsts()) {
SubOp = AMDGPU::V_SUB_U32_e64;
Opnds.push_back(
CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
@@ -1828,72 +1850,83 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
isFlatScratchBaseLegal(Addr))) {
int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
- const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
- Addr = N0;
- OffsetVal = COffsetVal;
- } else {
- // If the offset doesn't fit, put the low bits into the offset field and
- // add the rest.
- //
- // For a FLAT instruction the hardware decides whether to access
- // global/scratch/shared memory based on the high bits of vaddr,
- // ignoring the offset field, so we have to ensure that when we add
- // remainder to vaddr it still points into the same underlying object.
- // The easiest way to do that is to make sure that we split the offset
- // into two pieces that are both >= 0 or both <= 0.
-
- SDLoc DL(N);
- uint64_t RemainderOffset;
-
- std::tie(OffsetVal, RemainderOffset) =
- TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
-
- SDValue AddOffsetLo =
- getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
- SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
-
- if (Addr.getValueType().getSizeInBits() == 32) {
- SmallVector<SDValue, 3> Opnds;
- Opnds.push_back(N0);
- Opnds.push_back(AddOffsetLo);
- unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
- if (Subtarget->hasAddNoCarry()) {
- AddOp = AMDGPU::V_ADD_U32_e64;
- Opnds.push_back(Clamp);
- }
- Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+ // Adding the offset to the base address in a FLAT instruction must not
+ // change the memory aperture in which the address falls. Therefore we can
+ // only fold offsets from inbounds GEPs into FLAT instructions.
+ bool IsInBounds =
+ Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
+ if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
+ Addr = N0;
+ OffsetVal = COffsetVal;
} else {
- // TODO: Should this try to use a scalar add pseudo if the base address
- // is uniform and saddr is usable?
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
- SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
-
- SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, N0, Sub0);
- SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, N0, Sub1);
-
- SDValue AddOffsetHi =
- getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
-
- SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
-
- SDNode *Add =
- CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
- {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
-
- SDNode *Addc = CurDAG->getMachineNode(
- AMDGPU::V_ADDC_U32_e64, DL, VTs,
- {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
-
- SDValue RegSequenceArgs[] = {
- CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
- SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
-
- Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::i64, RegSequenceArgs),
- 0);
+ // If the offset doesn't fit, put the low bits into the offset field
+ // and add the rest.
+ //
+ // For a FLAT instruction the hardware decides whether to access
+ // global/scratch/shared memory based on the high bits of vaddr,
+ // ignoring the offset field, so we have to ensure that when we add
+ // remainder to vaddr it still points into the same underlying object.
+ // The easiest way to do that is to make sure that we split the offset
+ // into two pieces that are both >= 0 or both <= 0.
+
+ SDLoc DL(N);
+ uint64_t RemainderOffset;
+
+ std::tie(OffsetVal, RemainderOffset) =
+ TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
+
+ SDValue AddOffsetLo =
+ getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+ SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+
+ if (Addr.getValueType().getSizeInBits() == 32) {
+ SmallVector<SDValue, 3> Opnds;
+ Opnds.push_back(N0);
+ Opnds.push_back(AddOffsetLo);
+ unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
+ if (Subtarget->hasAddNoCarryInsts()) {
+ AddOp = AMDGPU::V_ADD_U32_e64;
+ Opnds.push_back(Clamp);
+ }
+ Addr =
+ SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+ } else {
+ // TODO: Should this try to use a scalar add pseudo if the base
+ // address is uniform and saddr is usable?
+ SDValue Sub0 =
+ CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+ SDValue Sub1 =
+ CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+ SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, N0, Sub0);
+ SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, N0, Sub1);
+
+ SDValue AddOffsetHi =
+ getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+
+ SDNode *Add =
+ CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
+ {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+
+ SDNode *Addc = CurDAG->getMachineNode(
+ AMDGPU::V_ADDC_U32_e64, DL, VTs,
+ {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+
+ SDValue RegSequenceArgs[] = {
+ CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
+ MVT::i32),
+ SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
+
+ Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::i64, RegSequenceArgs),
+ 0);
+ }
}
}
}
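A worked example of the split (the field width is subtarget- and variant-dependent; a 13-bit signed immediate field is assumed here for illustration):

// Hypothetical 13-bit signed offset field:
//   COffsetVal      = 0x11234   (does not fit the immediate field)
//   OffsetVal       = 0x234     (goes in the immediate field)
//   RemainderOffset = 0x11000   (added to vaddr; same sign as OffsetVal)
//   0x234 + 0x11000 == 0x11234

Because both pieces are non-negative, adding the remainder to vaddr cannot move the address out of the underlying object of the original inbounds pointer.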
@@ -3047,9 +3080,38 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
const unsigned Opc = gwsIntrinToOpcode(IntrID);
+
+ const MCInstrDesc &InstrDesc = TII->get(Opc);
+ int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
+
+ const TargetRegisterClass *DataRC = TII->getRegClass(InstrDesc, Data0Idx);
+
SmallVector<SDValue, 5> Ops;
- if (HasVSrc)
- Ops.push_back(N->getOperand(2));
+ if (HasVSrc) {
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+
+ SDValue Data = N->getOperand(2);
+ MVT DataVT = Data.getValueType().getSimpleVT();
+ if (TRI->isTypeLegalForClass(*DataRC, DataVT)) {
+ // Normal 32-bit case.
+ Ops.push_back(N->getOperand(2));
+ } else {
+      // The operand is really 32 bits, but requires 64-bit alignment, so use
+      // the even-aligned 64-bit register class.
+ const SDValue RegSeqOps[] = {
+ CurDAG->getTargetConstant(DataRC->getID(), SL, MVT::i32), Data,
+ CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
+ SDValue(
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, MVT::i32),
+ 0),
+ CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32)};
+
+ Ops.push_back(SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
+ SL, MVT::v2i32, RegSeqOps),
+ 0));
+ }
+ }
+
Ops.push_back(OffsetField);
Ops.push_back(Chain);
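The REG_SEQUENCE above amounts to the following MIR shape (an illustrative sketch; the exact register class name depends on the subtarget):

// %undef:vgpr_32 = IMPLICIT_DEF
// %pair:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0,
//                                     %undef, %subreg.sub1
// The 32-bit payload lands in the even-aligned low half (sub0); the
// undefined high half (sub1) is never read by the GWS instruction.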
@@ -4222,7 +4284,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
if (!getOperandBits(LHS, LHSBits) ||
!getOperandBits(RHS, RHSBits)) {
- Src = Backup;
+ Src = std::move(Backup);
return std::make_pair(0, 0);
}
@@ -4387,16 +4449,23 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
const auto *Ld = cast<LoadSDNode>(N);
-
const MachineMemOperand *MMO = Ld->getMemOperand();
- if (N->isDivergent() && !AMDGPU::isUniformMMO(MMO))
+
+  // FIXME: We ought to be able to take the direct isDivergent result. We
+  // cannot rely on the MMO for a uniformity check and should stop using it.
+  // This is a hack for two ways in which the IR divergence analysis is
+  // superior to the DAG divergence: recognizing shift-of-workitem-id as
+  // always uniform, and isSingleLaneExecution. Once these are handled in the
+  // DAG version, this can be dropped.
+ if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO))
return false;
return MMO->getSize().hasValue() &&
Ld->getAlign() >=
Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
uint64_t(4))) &&
- ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+ (MMO->isInvariant() ||
+ (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
(Subtarget->getScalarizeGlobalBehavior() &&
Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 4fa0d3f..a86b754 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELDAGTODAG_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELDAGTODAG_H
+#include "AMDGPUSelectionDAGInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIModeRegisterDefaults.h"
@@ -45,21 +46,6 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) {
return false;
}
-// TODO: Handle undef as zero
-static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
- assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
- uint32_t LHSVal, RHSVal;
- if (getConstantValue(N->getOperand(0), LHSVal) &&
- getConstantValue(N->getOperand(1), RHSVal)) {
- SDLoc SL(N);
- uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
- return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
- DAG.getTargetConstant(K, SL, MVT::i32));
- }
-
- return nullptr;
-}
-
/// AMDGPU specific code to select AMDGPU machine instructions for
/// SelectionDAG operations.
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
@@ -115,6 +101,8 @@ private:
MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
+ SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) const;
+
SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const;
SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
SDNode *glueCopyToM0LDSInit(SDNode *N) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 1b559a6..da21033 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "AMDGPUMemoryUtils.h"
+#include "AMDGPUSelectionDAGInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
@@ -59,8 +60,9 @@ unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
}
AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
- const AMDGPUSubtarget &STI)
- : TargetLowering(TM), Subtarget(&STI) {
+ const TargetSubtargetInfo &STI,
+ const AMDGPUSubtarget &AMDGPUSTI)
+ : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
// Always lower memset, memcpy, and memmove intrinsics to load/store
// instructions, rather than generating calls to memset, memcpy, or memmove.
MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
@@ -336,6 +338,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
+ setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand);
setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
@@ -424,22 +427,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
Expand);
setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
-
- if (Subtarget->has16BitInsts()) {
- setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
- setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal);
- } else {
- setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
- setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
- }
+ setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
+ setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
Custom);
setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
- if (Subtarget->has16BitInsts()) {
- setOperationAction(ISD::FCANONICALIZE, MVT::f16, Legal);
- }
// FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
// scalarization code. Can be removed when IS_FPCLASS expand isn't called by
@@ -451,11 +445,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MVT::v16f64},
Custom);
- if (isTypeLegal(MVT::f16))
- setOperationAction(ISD::IS_FPCLASS,
- {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
- Custom);
-
// Expand to fneg + fadd.
setOperationAction(ISD::FSUB, MVT::f64, Expand);
@@ -478,7 +467,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
Custom);
- setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+ setOperationAction({ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP}, MVT::f64,
+ Expand);
setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
@@ -502,16 +492,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// The hardware supports 32-bit FSHR, but not FSHL.
setOperationAction(ISD::FSHR, MVT::i32, Legal);
- // The hardware supports 32-bit ROTR, but not ROTL.
- setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
- setOperationAction(ISD::ROTR, MVT::i64, Expand);
+ setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
- setOperationAction(
- {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
- MVT::i64, Custom);
+ setOperationAction({ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT,
+ ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
+ ISD::FP_TO_UINT_SAT},
+ MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
@@ -530,19 +519,28 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
for (MVT VT : VectorIntTypes) {
// Expand the following operations for the current type by default.
- setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT,
- ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU,
- ISD::MULHS, ISD::OR, ISD::SHL,
- ISD::SRA, ISD::SRL, ISD::ROTL,
- ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP,
- ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV,
- ISD::SREM, ISD::UREM, ISD::SMUL_LOHI,
- ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM,
- ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC,
- ISD::XOR, ISD::BSWAP, ISD::CTPOP,
- ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE,
- ISD::SETCC, ISD::ADDRSPACECAST},
+ // clang-format off
+ setOperationAction({ISD::ADD, ISD::AND,
+ ISD::FP_TO_SINT, ISD::FP_TO_UINT,
+ ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
+ ISD::MUL, ISD::MULHU,
+ ISD::MULHS, ISD::OR,
+ ISD::SHL, ISD::SRA,
+ ISD::SRL, ISD::ROTL,
+ ISD::ROTR, ISD::SUB,
+ ISD::SINT_TO_FP, ISD::UINT_TO_FP,
+ ISD::SDIV, ISD::UDIV,
+ ISD::SREM, ISD::UREM,
+ ISD::SMUL_LOHI, ISD::UMUL_LOHI,
+ ISD::SDIVREM, ISD::UDIVREM,
+ ISD::SELECT, ISD::VSELECT,
+ ISD::SELECT_CC, ISD::XOR,
+ ISD::BSWAP, ISD::CTPOP,
+ ISD::CTTZ, ISD::CTLZ,
+ ISD::VECTOR_SHUFFLE, ISD::SETCC,
+ ISD::ADDRSPACECAST},
VT, Expand);
+ // clang-format on
}
static const MVT::SimpleValueType FloatVectorTypes[] = {
@@ -643,9 +641,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
}
bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
- if (getTargetMachine().Options.NoSignedZerosFPMath)
- return true;
-
const auto Flags = Op.getNode()->getFlags();
if (Flags.hasNoSignedZeros())
return true;
@@ -820,9 +815,7 @@ bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
- EVT ScalarVT = VT.getScalarType();
- return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
- (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
+ return isTypeLegal(VT.getScalarType());
}
// We don't want to shrink f64 / f32 constants.
@@ -966,8 +959,8 @@ bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
assert(VT.isFloatingPoint());
// Packed operations do not have a fabs modifier.
- return VT == MVT::f32 || VT == MVT::f64 ||
- (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
+ // Report this based on the end legalized type.
+ return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
@@ -1056,8 +1049,9 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX:
- if (Subtarget->has16BitInsts() &&
- (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
+ if (isTypeLegal(MVT::i16) &&
+ (!DestVT.isVector() ||
+         !isOperationLegal(ISD::ADD, MVT::v2i16))) { // Proxy for VOP3P
// Don't narrow back down to i16 if promoted to i32 already.
if (!N->isDivergent() && DestVT.isInteger() &&
DestVT.getScalarSizeInBits() > 1 &&
@@ -1216,9 +1210,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
const SmallVectorImpl<ISD::InputArg> &Ins) const {
const MachineFunction &MF = State.getMachineFunction();
const Function &Fn = MF.getFunction();
- LLVMContext &Ctx = Fn.getParent()->getContext();
- const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
- const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
+ LLVMContext &Ctx = Fn.getContext();
+ const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset();
CallingConv::ID CC = Fn.getCallingConv();
Align MaxAlign = Align(1);
@@ -1248,7 +1241,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
SmallVector<EVT, 16> ValueVTs;
SmallVector<uint64_t, 16> Offsets;
- ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
+ ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
+ &Offsets, ArgOffset);
for (unsigned Value = 0, NumValues = ValueVTs.size();
Value != NumValues; ++Value) {
@@ -1409,7 +1403,12 @@ SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
InVals.push_back(DAG.getPOISON(Arg.VT));
}
- return DAG.getEntryNode();
+ // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
+ if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
+ return CLI.Chain;
+
+ SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL);
+ return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL);
}
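The nodes built here form an empty but properly bracketed call sequence, roughly (a sketch of the resulting DAG):

// ch = CALLSEQ_START ch, 0, 0
// ch = CALLSEQ_END   ch, 0, 0, <no glue>
// Arguments of the unhandled call become poison; the paired pseudos keep the
// frame-lowering bookkeeping consistent even though no real call is emitted.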
SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
@@ -1465,6 +1464,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
return LowerFP_TO_INT(Op, DAG);
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ return LowerFP_TO_INT_SAT(Op, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTLZ:
@@ -1528,7 +1530,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
if (std::optional<uint32_t> Address =
AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
if (IsNamedBarrier) {
- unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+ unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16;
MFI->recordNumNamedBarriers(Address.value(), BarCnt);
}
return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
@@ -1885,14 +1887,14 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
Align BaseAlign = Load->getAlign();
Align HiAlign = commonAlignment(BaseAlign, Size);
- SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
- Load->getChain(), BasePtr, SrcValue, LoMemVT,
- BaseAlign, Load->getMemOperand()->getFlags());
+ SDValue LoLoad = DAG.getExtLoad(
+ Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue,
+ LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo());
SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
- SDValue HiLoad =
- DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
- HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
- HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
+ SDValue HiLoad = DAG.getExtLoad(
+ Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr,
+ SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign,
+ Load->getMemOperand()->getFlags(), Load->getAAInfo());
SDValue Join;
if (LoVT == HiVT) {
@@ -1980,10 +1982,10 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
SDValue LoStore =
DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
- Store->getMemOperand()->getFlags());
- SDValue HiStore =
- DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
- HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
+ Store->getMemOperand()->getFlags(), Store->getAAInfo());
+ SDValue HiStore = DAG.getTruncStore(
+ Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign,
+ Store->getMemOperand()->getFlags(), Store->getAAInfo());
return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}
@@ -2628,11 +2630,18 @@ static bool valueIsKnownNeverF32Denorm(SDValue Src) {
return Src.getOperand(0).getValueType() == MVT::f16;
case ISD::FP16_TO_FP:
case ISD::FFREXP:
+ case ISD::FSQRT:
+ case AMDGPUISD::LOG:
+ case AMDGPUISD::EXP:
return true;
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IntrinsicID = Src.getConstantOperandVal(0);
switch (IntrinsicID) {
case Intrinsic::amdgcn_frexp_mant:
+ case Intrinsic::amdgcn_log:
+ case Intrinsic::amdgcn_log_clamp:
+ case Intrinsic::amdgcn_exp2:
+ case Intrinsic::amdgcn_sqrt:
return true;
default:
return false;
@@ -2731,7 +2740,7 @@ SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
if (VT == MVT::f16) {
// Nothing in half is a denormal when promoted to f32.
- assert(!Subtarget->has16BitInsts());
+ assert(!isTypeLegal(VT));
SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
@@ -2764,20 +2773,18 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
EVT VT = Op.getValueType();
SDNodeFlags Flags = Op->getFlags();
SDLoc DL(Op);
-
const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
- const auto &Options = getTargetMachine().Options;
if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
- if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
+ if (VT == MVT::f16 && !isTypeLegal(MVT::f16)) {
// Log and multiply in f32 is good enough for f16.
X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
}
SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
- if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
+ if (VT == MVT::f16 && !isTypeLegal(MVT::f16)) {
return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
DAG.getTargetConstant(0, DL, MVT::i32), Flags);
}
@@ -2803,7 +2810,9 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
-
+    // The correction terms below rely on the exactly rounded product;
+    // contracting the multiply into an FMA would increase the error of the
+    // approximation, so disable contraction.
+ Flags.setAllowContract(false);
R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
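The compensation only works if $r$ is the correctly rounded single-precision product: with $r = \mathrm{fl}(y \cdot c)$, the step $e = \mathrm{fma}(y, c, -r)$ recovers the exact rounding error $yc - r$. If the multiply itself were contracted into an FMA, $r$ would no longer be that rounded product and $e$ would be meaningless.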
@@ -2826,15 +2835,16 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
-
+    // The correction terms below rely on the exactly rounded product;
+    // contracting the multiply into an FMA would increase the error of the
+    // approximation, so disable contraction.
+ Flags.setAllowContract(false);
SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
R = getMad(DAG, DL, VT, YH, CH, Mad1);
}
- const bool IsFiniteOnly =
- (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs();
+ const bool IsFiniteOnly = Flags.hasNoNaNs() && Flags.hasNoInfs();
// TODO: Check if known finite from source value.
if (!IsFiniteOnly) {
@@ -2910,7 +2920,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
if (VT == MVT::f16) {
// Nothing in half is a denormal when promoted to f32.
- assert(!Subtarget->has16BitInsts());
+ assert(!isTypeLegal(MVT::f16));
SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
@@ -2950,19 +2960,28 @@ SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
}
+SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL,
+ SelectionDAG &DAG,
+ SDNodeFlags Flags,
+ bool IsExp10) const {
+ // exp(x) -> exp2(M_LOG2E_F * x);
+ // exp10(x) -> exp2(log2(10) * x);
+ EVT VT = X.getValueType();
+ SDValue Const =
+ DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT);
+
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags);
+ return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
+ : (unsigned)ISD::FEXP2,
+ SL, VT, Mul, Flags);
+}
+
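Both lowerings use the change-of-base identity
$\exp(x) = 2^{\,x \log_2 e}$ and $10^{x} = 2^{\,x \log_2 10}$,
where the f32 constant 0x1.a934f0p+1 ≈ 3.3219285 approximates $\log_2 10 \approx 3.3219281$.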
SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
SelectionDAG &DAG,
SDNodeFlags Flags) const {
EVT VT = X.getValueType();
- const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
-
- if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
- // exp2(M_LOG2E_F * f);
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
- return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
- : (unsigned)ISD::FEXP2,
- SL, VT, Mul, Flags);
- }
+ if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags))
+ return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
@@ -2976,6 +2995,7 @@ SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
SDValue AdjustedX =
DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
+ const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
@@ -2994,6 +3014,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
SelectionDAG &DAG,
SDNodeFlags Flags) const {
const EVT VT = X.getValueType();
+
const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
: static_cast<unsigned>(ISD::FEXP2);
@@ -3050,33 +3071,32 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
SDNodeFlags Flags = Op->getFlags();
const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
- if (VT.getScalarType() == MVT::f16) {
- // v_exp_f16 (fmul x, log2e)
- if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
- return lowerFEXPUnsafe(X, SL, DAG, Flags);
+ // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
+ // library behavior. Also, is known-not-daz source sufficient?
+ if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
+ return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
+ : lowerFEXPUnsafe(X, SL, DAG, Flags);
+ }
+ if (VT.getScalarType() == MVT::f16) {
if (VT.isVector())
return SDValue();
+ // Nothing in half is a denormal when promoted to f32.
+ //
// exp(f16 x) ->
// fptrunc (v_exp_f32 (fmul (fpext x), log2e))
-
- // Nothing in half is a denormal when promoted to f32.
+ //
+ // exp10(f16 x) ->
+ // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
- SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
+ SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10);
return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
DAG.getTargetConstant(0, SL, MVT::i32), Flags);
}
assert(VT == MVT::f32);
- // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
- // library behavior. Also, is known-not-daz source sufficient?
- if (allowApproxFunc(DAG, Flags)) {
- return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
- : lowerFEXPUnsafe(X, SL, DAG, Flags);
- }
-
// Algorithm:
//
// e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
@@ -3369,8 +3389,9 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
// Get the 32-bit normalized integer.
Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
// Convert the normalized 32-bit integer into f32.
- unsigned Opc =
- (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
+
+ bool UseLDEXP = isOperationLegal(ISD::FLDEXP, MVT::f32);
+ unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
// Finally, need to scale back the converted floating number as the original
@@ -3378,7 +3399,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
ShAmt);
  // If FLDEXP is legal, use it directly.
- if (Subtarget->isGCN())
+ if (UseLDEXP)
return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
// Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
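Put as a formula, the scale-back computes $f = \mathrm{float}(\mathrm{Norm}) \cdot 2^{\,32 - s}$, where $s$ is the earlier normalization shift; when FLDEXP is legal this is a single ldexp, and otherwise the power of two is folded into the exponent field by integer arithmetic.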
@@ -3445,7 +3466,7 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
if (SrcVT != MVT::i64)
return Op;
- if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
+ if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) {
SDLoc DL(Op);
SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
@@ -3493,7 +3514,7 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
// TODO: Factor out code common with LowerUINT_TO_FP.
- if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
+ if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) {
SDLoc DL(Op);
SDValue Src = Op.getOperand(0);
@@ -3737,6 +3758,86 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
return SDValue();
}
+SDValue AMDGPUTargetLowering::LowerFP_TO_INT_SAT(const SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Src = Op.getOperand(0);
+ unsigned OpOpcode = Op.getOpcode();
+ EVT SrcVT = Src.getValueType();
+ EVT DstVT = Op.getValueType();
+ SDValue SatVTOp = Op.getNode()->getOperand(1);
+ EVT SatVT = cast<VTSDNode>(SatVTOp)->getVT();
+ SDLoc DL(Op);
+
+ uint64_t DstWidth = DstVT.getScalarSizeInBits();
+ uint64_t SatWidth = SatVT.getScalarSizeInBits();
+ assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
+
+ // Will be selected natively
+ if (DstVT == MVT::i32 && SatWidth == DstWidth &&
+ (SrcVT == MVT::f32 || SrcVT == MVT::f64))
+ return Op;
+
+ const SDValue Int32VT = DAG.getValueType(MVT::i32);
+
+ // Perform all saturation at i32 and truncate
+ if (SatWidth < DstWidth) {
+ const uint64_t Int32Width = 32;
+ SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src, Int32VT);
+ SDValue Int32SatVal;
+
+    if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
+      SDValue SatMax = DAG.getConstant(
+          APInt::getSignedMaxValue(SatWidth).sext(Int32Width), DL, MVT::i32);
+      SDValue SatMin = DAG.getConstant(
+          APInt::getSignedMinValue(SatWidth).sext(Int32Width), DL, MVT::i32);
+      SDValue Clamped =
+          DAG.getNode(ISD::SMIN, DL, MVT::i32, FpToInt32, SatMax);
+      Int32SatVal = DAG.getNode(ISD::SMAX, DL, MVT::i32, Clamped, SatMin);
+    } else {
+      SDValue SatMax = DAG.getConstant(
+          APInt::getMaxValue(SatWidth).zext(Int32Width), DL, MVT::i32);
+      Int32SatVal = DAG.getNode(ISD::UMIN, DL, MVT::i32, FpToInt32, SatMax);
+    }
+
+ if (DstWidth == Int32Width)
+ return Int32SatVal;
+ if (DstWidth < Int32Width)
+ return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Int32SatVal);
+
+ // DstWidth > Int32Width
+ const unsigned Ext =
+ OpOpcode == ISD::FP_TO_SINT_SAT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    return DAG.getNode(Ext, DL, DstVT, Int32SatVal);
+ }
+
+ // SatWidth == DstWidth
+
+ // Saturate at i32 for i64 dst and 16b src (will invoke f16 promotion below)
+ if (DstVT == MVT::i64 &&
+ (SrcVT == MVT::f16 || SrcVT == MVT::bf16 ||
+ (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP))) {
+ return DAG.getNode(OpOpcode, DL, DstVT, Src, Int32VT);
+ }
+
+ // Promote f16/bf16 src to f32
+ if (SrcVT == MVT::f16 || SrcVT == MVT::bf16) {
+ SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
+ return DAG.getNode(Op.getOpcode(), DL, DstVT, PromotedSrc, SatVTOp);
+ }
+
+ // Promote sub-i32 dst to i32 with sub-i32 saturation
+ if (DstWidth < 32) {
+    // Note: requesting an MVT::i32 destination with SatWidth < 32 sends the
+    // new node through the SatWidth < DstWidth path above, which produces
+    // the saturated value to truncate.
+ SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src, SatVTOp);
+ return DAG.getNode(ISD::TRUNCATE, DL, DstVT, FpToInt32);
+ }
+
+ // TODO: can we implement i64 dst for f32/f64?
+
+ return SDValue();
+}
+
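A scalar model of the SatWidth < DstWidth path for a signed i16 saturation performed at i32 (a sketch mirroring the SMIN/SMAX/TRUNCATE nodes above, with the native i32 saturating conversion folded in):

#include <algorithm>
#include <cstdint>

int16_t fpToI16Sat(float F) {
  // FP_TO_SINT_SAT to i32: NaN -> 0, out-of-range clamps to the i32 limits.
  int32_t V;
  if (F != F)
    V = 0;
  else if (F >= 2147483648.0f)
    V = INT32_MAX;
  else if (F < -2147483648.0f)
    V = INT32_MIN;
  else
    V = static_cast<int32_t>(F);
  V = std::min(V, int32_t(INT16_MAX)); // SMIN with the signed max (32767)
  V = std::max(V, int32_t(INT16_MIN)); // SMAX with the signed min (-32768)
  return static_cast<int16_t>(V);      // TRUNCATE to i16
}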
SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {
EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
@@ -4125,8 +4226,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
EVT ElementType = VT.getScalarType();
EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
- EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
- : TargetScalarType;
+ EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
return SDValue();
@@ -4190,8 +4290,7 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
EVT ElementType = VT.getScalarType();
EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
- EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
- : TargetScalarType;
+ EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
return SDValue();
@@ -4312,8 +4411,7 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
EVT ElementType = VT.getScalarType();
EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
- EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
- : TargetScalarType;
+ EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
return SDValue();
@@ -4547,7 +4645,7 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
}
// There are i16 integer mul/mad.
- if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
+ if (isTypeLegal(MVT::i16) && VT.getScalarType().bitsLE(MVT::i16))
return SDValue();
// SimplifyDemandedBits has the annoying habit of turning useful zero_extends
@@ -4666,7 +4764,7 @@ SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
- if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
+ if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24())
return SDValue();
// Don't generate 24-bit multiplies on values that are in SGPRs, since
@@ -4675,7 +4773,7 @@ SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
// value is in an SGPR.
// This doesn't apply if no s_mul_hi is available (since we'll end up with a
// valu op anyway)
- if (Subtarget->hasSMulHi() && !N->isDivergent())
+ if (!N->isDivergent() && Subtarget->hasSMulHi())
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -4700,9 +4798,7 @@ SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
const SDLoc &DL,
unsigned Opc) const {
EVT VT = Op.getValueType();
- EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
- if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
- LegalVT != MVT::i16))
+ if (VT.bitsGT(MVT::i32))
return SDValue();
if (VT != MVT::i32)
@@ -4999,7 +5095,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
SDLoc SL(N);
switch (Opc) {
case ISD::FADD: {
- if (!mayIgnoreSignedZero(N0))
+ if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())
return SDValue();
// (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
@@ -5047,7 +5143,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
case ISD::FMA:
case ISD::FMAD: {
// TODO: handle llvm.amdgcn.fma.legacy
- if (!mayIgnoreSignedZero(N0))
+ if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())
return SDValue();
// (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
@@ -5259,7 +5355,7 @@ SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
switch (N0.getOpcode()) {
case ISD::FP16_TO_FP: {
- assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
+ assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal");
SDLoc SL(N);
SDValue Src = N0.getOperand(0);
EVT SrcVT = Src.getValueType();
@@ -5459,7 +5555,7 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
}
if ((OffsetVal + WidthVal) >= 32 &&
- !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
+ !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) {
SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
BitsFrom, ShiftVal);
@@ -5649,169 +5745,6 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
}
-#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
-
-const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
- switch ((AMDGPUISD::NodeType)Opcode) {
- case AMDGPUISD::FIRST_NUMBER: break;
- // AMDIL DAG nodes
- NODE_NAME_CASE(BRANCH_COND);
-
- // AMDGPU DAG nodes
- NODE_NAME_CASE(IF)
- NODE_NAME_CASE(ELSE)
- NODE_NAME_CASE(LOOP)
- NODE_NAME_CASE(CALL)
- NODE_NAME_CASE(TC_RETURN)
- NODE_NAME_CASE(TC_RETURN_GFX)
- NODE_NAME_CASE(TC_RETURN_GFX_WholeWave)
- NODE_NAME_CASE(TC_RETURN_CHAIN)
- NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR)
- NODE_NAME_CASE(TRAP)
- NODE_NAME_CASE(RET_GLUE)
- NODE_NAME_CASE(WAVE_ADDRESS)
- NODE_NAME_CASE(RETURN_TO_EPILOG)
- NODE_NAME_CASE(ENDPGM)
- NODE_NAME_CASE(ENDPGM_TRAP)
- NODE_NAME_CASE(SIMULATED_TRAP)
- NODE_NAME_CASE(DWORDADDR)
- NODE_NAME_CASE(FRACT)
- NODE_NAME_CASE(SETCC)
- NODE_NAME_CASE(DENORM_MODE)
- NODE_NAME_CASE(FMA_W_CHAIN)
- NODE_NAME_CASE(FMUL_W_CHAIN)
- NODE_NAME_CASE(CLAMP)
- NODE_NAME_CASE(COS_HW)
- NODE_NAME_CASE(SIN_HW)
- NODE_NAME_CASE(FMAX_LEGACY)
- NODE_NAME_CASE(FMIN_LEGACY)
- NODE_NAME_CASE(FMAX3)
- NODE_NAME_CASE(SMAX3)
- NODE_NAME_CASE(UMAX3)
- NODE_NAME_CASE(FMIN3)
- NODE_NAME_CASE(SMIN3)
- NODE_NAME_CASE(UMIN3)
- NODE_NAME_CASE(FMED3)
- NODE_NAME_CASE(SMED3)
- NODE_NAME_CASE(UMED3)
- NODE_NAME_CASE(FMAXIMUM3)
- NODE_NAME_CASE(FMINIMUM3)
- NODE_NAME_CASE(FDOT2)
- NODE_NAME_CASE(URECIP)
- NODE_NAME_CASE(DIV_SCALE)
- NODE_NAME_CASE(DIV_FMAS)
- NODE_NAME_CASE(DIV_FIXUP)
- NODE_NAME_CASE(FMAD_FTZ)
- NODE_NAME_CASE(RCP)
- NODE_NAME_CASE(RSQ)
- NODE_NAME_CASE(RCP_LEGACY)
- NODE_NAME_CASE(RCP_IFLAG)
- NODE_NAME_CASE(LOG)
- NODE_NAME_CASE(EXP)
- NODE_NAME_CASE(FMUL_LEGACY)
- NODE_NAME_CASE(RSQ_CLAMP)
- NODE_NAME_CASE(FP_CLASS)
- NODE_NAME_CASE(DOT4)
- NODE_NAME_CASE(CARRY)
- NODE_NAME_CASE(BORROW)
- NODE_NAME_CASE(BFE_U32)
- NODE_NAME_CASE(BFE_I32)
- NODE_NAME_CASE(BFI)
- NODE_NAME_CASE(BFM)
- NODE_NAME_CASE(FFBH_U32)
- NODE_NAME_CASE(FFBH_I32)
- NODE_NAME_CASE(FFBL_B32)
- NODE_NAME_CASE(MUL_U24)
- NODE_NAME_CASE(MUL_I24)
- NODE_NAME_CASE(MULHI_U24)
- NODE_NAME_CASE(MULHI_I24)
- NODE_NAME_CASE(MAD_U24)
- NODE_NAME_CASE(MAD_I24)
- NODE_NAME_CASE(MAD_I64_I32)
- NODE_NAME_CASE(MAD_U64_U32)
- NODE_NAME_CASE(PERM)
- NODE_NAME_CASE(TEXTURE_FETCH)
- NODE_NAME_CASE(R600_EXPORT)
- NODE_NAME_CASE(CONST_ADDRESS)
- NODE_NAME_CASE(REGISTER_LOAD)
- NODE_NAME_CASE(REGISTER_STORE)
- NODE_NAME_CASE(CVT_F32_UBYTE0)
- NODE_NAME_CASE(CVT_F32_UBYTE1)
- NODE_NAME_CASE(CVT_F32_UBYTE2)
- NODE_NAME_CASE(CVT_F32_UBYTE3)
- NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
- NODE_NAME_CASE(CVT_PKNORM_I16_F32)
- NODE_NAME_CASE(CVT_PKNORM_U16_F32)
- NODE_NAME_CASE(CVT_PK_I16_I32)
- NODE_NAME_CASE(CVT_PK_U16_U32)
- NODE_NAME_CASE(FP_TO_FP16)
- NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
- NODE_NAME_CASE(CONST_DATA_PTR)
- NODE_NAME_CASE(PC_ADD_REL_OFFSET)
- NODE_NAME_CASE(PC_ADD_REL_OFFSET64)
- NODE_NAME_CASE(LDS)
- NODE_NAME_CASE(DUMMY_CHAIN)
- NODE_NAME_CASE(LOAD_D16_HI)
- NODE_NAME_CASE(LOAD_D16_LO)
- NODE_NAME_CASE(LOAD_D16_HI_I8)
- NODE_NAME_CASE(LOAD_D16_HI_U8)
- NODE_NAME_CASE(LOAD_D16_LO_I8)
- NODE_NAME_CASE(LOAD_D16_LO_U8)
- NODE_NAME_CASE(STORE_MSKOR)
- NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
- NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
- NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
- NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
- NODE_NAME_CASE(DS_ORDERED_COUNT)
- NODE_NAME_CASE(ATOMIC_CMP_SWAP)
- NODE_NAME_CASE(BUFFER_LOAD)
- NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
- NODE_NAME_CASE(BUFFER_LOAD_USHORT)
- NODE_NAME_CASE(BUFFER_LOAD_BYTE)
- NODE_NAME_CASE(BUFFER_LOAD_SHORT)
- NODE_NAME_CASE(BUFFER_LOAD_TFE)
- NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
- NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
- NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
- NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
- NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
- NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
- NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
- NODE_NAME_CASE(SBUFFER_LOAD)
- NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
- NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
- NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
- NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
- NODE_NAME_CASE(SBUFFER_PREFETCH_DATA)
- NODE_NAME_CASE(BUFFER_STORE)
- NODE_NAME_CASE(BUFFER_STORE_BYTE)
- NODE_NAME_CASE(BUFFER_STORE_SHORT)
- NODE_NAME_CASE(BUFFER_STORE_FORMAT)
- NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
- NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
- NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
- NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
- NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
- NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
- NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
- NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
- NODE_NAME_CASE(BUFFER_ATOMIC_AND)
- NODE_NAME_CASE(BUFFER_ATOMIC_OR)
- NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
- NODE_NAME_CASE(BUFFER_ATOMIC_INC)
- NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
- NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
- NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
- NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
- NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
- NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
- NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
- NODE_NAME_CASE(WHOLE_WAVE_SETUP)
- NODE_NAME_CASE(WHOLE_WAVE_RETURN)
- }
- return nullptr;
-}
-
SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
SelectionDAG &DAG, int Enabled,
int &RefinementSteps,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index bdaf486..adbc2c7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -51,7 +51,6 @@ protected:
/// Split a vector store into multiple scalar stores.
/// \returns The resulting chain.
- SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
@@ -78,6 +77,9 @@ protected:
bool IsLog10, SDNodeFlags Flags) const;
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG,
+ SDNodeFlags Flags, bool IsExp10) const;
+
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG,
SDNodeFlags Flags) const;
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG,
@@ -96,6 +98,7 @@ protected:
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
SelectionDAG &DAG) const;
@@ -180,7 +183,8 @@ protected:
const SmallVectorImpl<ISD::InputArg> &Ins) const;
public:
- AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
+ AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI,
+ const AMDGPUSubtarget &AMDGPUSTI);
bool mayIgnoreSignedZero(SDValue Op) const;
@@ -280,8 +284,6 @@ public:
SDValue RHS, SDValue True, SDValue False,
SDValue CC, DAGCombinerInfo &DCI) const;
- const char* getTargetNodeName(unsigned Opcode) const override;
-
// FIXME: Turn off MergeConsecutiveStores() before Instruction Selection for
// AMDGPU. Commit r319036,
// (https://github.com/llvm/llvm-project/commit/db77e57ea86d941a4262ef60261692f4cb6893e6)
@@ -406,235 +408,6 @@ public:
}
};
-namespace AMDGPUISD {
-
-enum NodeType : unsigned {
- // AMDIL ISD Opcodes
- FIRST_NUMBER = ISD::BUILTIN_OP_END,
- BRANCH_COND,
- // End AMDIL ISD Opcodes
-
- // Function call.
- CALL,
- TC_RETURN,
- TC_RETURN_GFX,
- TC_RETURN_GFX_WholeWave,
- TC_RETURN_CHAIN,
- TC_RETURN_CHAIN_DVGPR,
- TRAP,
-
- // Masked control flow nodes.
- IF,
- ELSE,
- LOOP,
-
- // A uniform kernel return that terminates the wavefront.
- ENDPGM,
-
- // s_endpgm, but we may want to insert it in the middle of the block.
- ENDPGM_TRAP,
-
- // "s_trap 2" equivalent on hardware that does not support it.
- SIMULATED_TRAP,
-
- // Return to a shader part's epilog code.
- RETURN_TO_EPILOG,
-
- // Return with values from a non-entry function.
- RET_GLUE,
-
-  // Convert an unswizzled wave uniform stack address to an address compatible
- // with a vector offset for use in stack access.
- WAVE_ADDRESS,
-
- DWORDADDR,
- FRACT,
-
- /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output
- /// modifier behavior with dx10_enable.
- CLAMP,
-
- // This is SETCC with the full mask result which is used for a compare with a
- // result bit per item in the wavefront.
- SETCC,
-
- DENORM_MODE,
-
- // FP ops with input and output chain.
- FMA_W_CHAIN,
- FMUL_W_CHAIN,
-
- // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
- // Denormals handled on some parts.
- COS_HW,
- SIN_HW,
- FMAX_LEGACY,
- FMIN_LEGACY,
-
- FMAX3,
- SMAX3,
- UMAX3,
- FMIN3,
- SMIN3,
- UMIN3,
- FMED3,
- SMED3,
- UMED3,
- FMAXIMUM3,
- FMINIMUM3,
- FDOT2,
- URECIP,
- DIV_SCALE,
- DIV_FMAS,
- DIV_FIXUP,
- // For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is
- // treated as an illegal operation.
- FMAD_FTZ,
-
- // RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
- // For f64, max error 2^29 ULP, handles denormals.
- RCP,
- RSQ,
- RCP_LEGACY,
- RCP_IFLAG,
-
- // log2, no denormal handling for f32.
- LOG,
-
- // exp2, no denormal handling for f32.
- EXP,
-
- FMUL_LEGACY,
- RSQ_CLAMP,
- FP_CLASS,
- DOT4,
- CARRY,
- BORROW,
- BFE_U32, // Extract range of bits with zero extension to 32-bits.
- BFE_I32, // Extract range of bits with sign extension to 32-bits.
- BFI, // (src0 & src1) | (~src0 & src2)
- BFM, // Insert a range of bits into a 32-bit word.
- FFBH_U32, // ctlz with -1 if input is zero.
- FFBH_I32,
- FFBL_B32, // cttz with -1 if input is zero.
- MUL_U24,
- MUL_I24,
- MULHI_U24,
- MULHI_I24,
- MAD_U24,
- MAD_I24,
- MAD_U64_U32,
- MAD_I64_I32,
- PERM,
- TEXTURE_FETCH,
- R600_EXPORT,
- CONST_ADDRESS,
- REGISTER_LOAD,
- REGISTER_STORE,
-
- // These cvt_f32_ubyte* nodes need to remain consecutive and in order.
- CVT_F32_UBYTE0,
- CVT_F32_UBYTE1,
- CVT_F32_UBYTE2,
- CVT_F32_UBYTE3,
-
- // Convert two float 32 numbers into a single register holding two packed f16
- // with round to zero.
- CVT_PKRTZ_F16_F32,
- CVT_PKNORM_I16_F32,
- CVT_PKNORM_U16_F32,
- CVT_PK_I16_I32,
- CVT_PK_U16_U32,
-
- // Same as the standard node, except the high bits of the resulting integer
- // are known 0.
- FP_TO_FP16,
-
- /// This node is for VLIW targets and it is used to represent a vector
- /// that is stored in consecutive registers with the same channel.
- /// For example:
- /// |X |Y|Z|W|
- /// T0|v.x| | | |
- /// T1|v.y| | | |
- /// T2|v.z| | | |
- /// T3|v.w| | | |
- BUILD_VERTICAL_VECTOR,
- /// Pointer to the start of the shader's constant data.
- CONST_DATA_PTR,
- PC_ADD_REL_OFFSET,
- PC_ADD_REL_OFFSET64,
- LDS,
-
- DUMMY_CHAIN,
-
- FIRST_MEMORY_OPCODE,
- LOAD_D16_HI = FIRST_MEMORY_OPCODE,
- LOAD_D16_LO,
- LOAD_D16_HI_I8,
- LOAD_D16_HI_U8,
- LOAD_D16_LO_I8,
- LOAD_D16_LO_U8,
-
- STORE_MSKOR,
- TBUFFER_STORE_FORMAT,
- TBUFFER_STORE_FORMAT_D16,
- TBUFFER_LOAD_FORMAT,
- TBUFFER_LOAD_FORMAT_D16,
- DS_ORDERED_COUNT,
- ATOMIC_CMP_SWAP,
- BUFFER_LOAD,
- BUFFER_LOAD_UBYTE,
- BUFFER_LOAD_USHORT,
- BUFFER_LOAD_BYTE,
- BUFFER_LOAD_SHORT,
- BUFFER_LOAD_TFE,
- BUFFER_LOAD_UBYTE_TFE,
- BUFFER_LOAD_USHORT_TFE,
- BUFFER_LOAD_BYTE_TFE,
- BUFFER_LOAD_SHORT_TFE,
- BUFFER_LOAD_FORMAT,
- BUFFER_LOAD_FORMAT_TFE,
- BUFFER_LOAD_FORMAT_D16,
- SBUFFER_LOAD,
- SBUFFER_LOAD_BYTE,
- SBUFFER_LOAD_UBYTE,
- SBUFFER_LOAD_SHORT,
- SBUFFER_LOAD_USHORT,
- SBUFFER_PREFETCH_DATA,
- BUFFER_STORE,
- BUFFER_STORE_BYTE,
- BUFFER_STORE_SHORT,
- BUFFER_STORE_FORMAT,
- BUFFER_STORE_FORMAT_D16,
- BUFFER_ATOMIC_SWAP,
- BUFFER_ATOMIC_ADD,
- BUFFER_ATOMIC_SUB,
- BUFFER_ATOMIC_SMIN,
- BUFFER_ATOMIC_UMIN,
- BUFFER_ATOMIC_SMAX,
- BUFFER_ATOMIC_UMAX,
- BUFFER_ATOMIC_AND,
- BUFFER_ATOMIC_OR,
- BUFFER_ATOMIC_XOR,
- BUFFER_ATOMIC_INC,
- BUFFER_ATOMIC_DEC,
- BUFFER_ATOMIC_CMPSWAP,
- BUFFER_ATOMIC_CSUB,
- BUFFER_ATOMIC_FADD,
- BUFFER_ATOMIC_FMIN,
- BUFFER_ATOMIC_FMAX,
- BUFFER_ATOMIC_COND_SUB_U32,
- LAST_MEMORY_OPCODE = BUFFER_ATOMIC_COND_SUB_U32,
-
- // Set up a whole wave function.
- WHOLE_WAVE_SETUP,
-
- // Return from a whole wave function.
- WHOLE_WAVE_RETURN,
-};
-
-} // End namespace AMDGPUISD
-
} // End namespace llvm
#endif
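
Aside: the long run of NODE_NAME_CASE removals above expands through a small
stringizing macro; a sketch of the idiom, assuming the usual definition that
sat next to getTargetNodeName in AMDGPUISelLowering.cpp:

  // Expands to one switch case per node, returning the node's name as a string.
  #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
  // e.g. NODE_NAME_CASE(FRACT) becomes: case AMDGPUISD::FRACT: return "FRACT";
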
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index 9a90787..d1b9fb4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -68,10 +68,12 @@ public:
// Get the delay type for a MachineInstr.
DelayType getDelayType(const MachineInstr &MI) {
- if (SIInstrInfo::isTRANS(MI))
+ // Non-F64 TRANS instructions use a separate delay type.
+ if (SIInstrInfo::isTRANS(MI) &&
+ !AMDGPU::isDPMACCInstruction(MI.getOpcode()))
return TRANS;
// WMMA XDL ops are treated the same as TRANS.
- if (AMDGPU::isGFX1250(*ST) && SII->isXDLWMMA(MI))
+ if (ST->hasGFX1250Insts() && SII->isXDLWMMA(MI))
return TRANS;
if (SIInstrInfo::isVALU(MI))
return VALU;
@@ -221,7 +223,7 @@ public:
};
// A map from regunits to the delay info for that regunit.
- struct DelayState : DenseMap<unsigned, DelayInfo> {
+ struct DelayState : DenseMap<MCRegUnit, DelayInfo> {
// Merge another DelayState into this one by merging the delay info for each
// regunit.
void merge(const DelayState &RHS) {
@@ -325,6 +327,13 @@ public:
for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
E = MachineBasicBlock::instr_iterator(MI);
++I != E;) {
+ if (I->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
+ // It is not deterministic whether the skip count counts
+ // S_SET_VGPR_MSB instructions or not, so do not include them in a
+ // skip region.
+ Skip = 6;
+ break;
+ }
if (!I->isBundle() && !I->isMetaInstruction())
++Skip;
}
@@ -359,7 +368,8 @@ public:
bool Changed = false;
MachineInstr *LastDelayAlu = nullptr;
- MCRegUnit LastSGPRFromVALU = 0;
+ // FIXME: 0 is a valid register unit.
+ MCRegUnit LastSGPRFromVALU = static_cast<MCRegUnit>(0);
// Iterate over the contents of bundles, but don't emit any instructions
// inside a bundle.
for (auto &MI : MBB.instrs()) {
@@ -379,7 +389,8 @@ public:
if (It != State.end()) {
DelayInfo Info = It->getSecond();
State.advanceByVALUNum(Info.VALUNum);
- LastSGPRFromVALU = 0;
+ // FIXME: 0 is a valid register unit.
+ LastSGPRFromVALU = static_cast<MCRegUnit>(0);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 4fe5d00..376184e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -35,7 +35,7 @@ struct AMDGPUImageDMaskIntrinsic {
};
#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
-#include "InstCombineTables.inc"
+#include "AMDGPUGenSearchableTables.inc"
} // end anonymous namespace
@@ -553,6 +553,89 @@ static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
return NewCall;
}
+// Return true for sequences of instructions that effectively compute each
+// lane's thread ID.
+static bool isThreadID(const GCNSubtarget &ST, Value *V) {
+ // Case 1:
+ // wave32: mbcnt_lo(-1, 0)
+ // wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0))
+ auto W32Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(m_ConstantInt<-1>(),
+ m_ConstantInt<0>());
+ auto W64Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
+ m_ConstantInt<-1>(), m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
+ m_ConstantInt<-1>(), m_ConstantInt<0>()));
+ if (ST.isWave32() && match(V, W32Pred))
+ return true;
+ if (ST.isWave64() && match(V, W64Pred))
+ return true;
+
+ return false;
+}
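
A minimal standalone model of what the matched mbcnt sequences compute;
laneThreadId is a hypothetical helper for illustration, not part of this
change:

  #include <bit>
  #include <cstdint>

  // mbcnt counts the set mask bits strictly below the executing lane; with
  // Mask = -1 that is the lane index itself, i.e. the lane's thread ID.
  unsigned laneThreadId(uint64_t Mask, unsigned Lane) {
    uint64_t Below = Lane ? (Mask & ((uint64_t(1) << Lane) - 1)) : 0;
    return std::popcount(Below); // == Lane when Mask is all ones
  }
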
+
+// Attempt to recognize cases where the index argument matches a DPP
+// pattern, and convert the shuffle to a DPP-based mov.
+static std::optional<Instruction *>
+tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) {
+ Value *Val = II.getArgOperand(0);
+ Value *Idx = II.getArgOperand(1);
+ auto &B = IC.Builder;
+
+  // DPP16 Row Share requires a known wave size and architecture support.
+ if (!ST.isWaveSizeKnown() || !ST.hasDPPRowShare())
+ return std::nullopt;
+
+ Value *Tid;
+ uint64_t Mask;
+ uint64_t RowIdx;
+ bool CanDPP16RowShare = false;
+
+ // wave32 requires Mask & 0x1F == 0x10
+ // wave64 requires Mask & 0x3F == 0x30
+ uint64_t MaskCheck = (1UL << ST.getWavefrontSizeLog2()) - 1;
+ uint64_t MaskTarget = MaskCheck & 0xF0;
+
+ // DPP16 Row Share 0: Idx = Tid & Mask
+ auto RowShare0Pred = m_And(m_Value(Tid), m_ConstantInt(Mask));
+
+ // DPP16 Row Share (0 < Row < 15): Idx = (Tid & Mask) | RowIdx
+ auto RowSharePred =
+ m_Or(m_And(m_Value(Tid), m_ConstantInt(Mask)), m_ConstantInt(RowIdx));
+
+ // DPP16 Row Share 15: Idx = Tid | 0xF
+ auto RowShare15Pred = m_Or(m_Value(Tid), m_ConstantInt<0xF>());
+
+ if (match(Idx, RowShare0Pred) && isThreadID(ST, Tid)) {
+ if ((Mask & MaskCheck) != MaskTarget)
+ return std::nullopt;
+
+ RowIdx = 0;
+ CanDPP16RowShare = true;
+ } else if (match(Idx, RowSharePred) && isThreadID(ST, Tid) && RowIdx < 15 &&
+ RowIdx > 0) {
+ if ((Mask & MaskCheck) != MaskTarget)
+ return std::nullopt;
+
+ CanDPP16RowShare = true;
+ } else if (match(Idx, RowShare15Pred) && isThreadID(ST, Tid)) {
+ RowIdx = 15;
+ CanDPP16RowShare = true;
+ }
+
+ if (CanDPP16RowShare) {
+ CallInst *UpdateDPP =
+ B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Val->getType(),
+ {PoisonValue::get(Val->getType()), Val,
+ B.getInt32(AMDGPU::DPP::ROW_SHARE0 | RowIdx),
+ B.getInt32(0xF), B.getInt32(0xF), B.getFalse()});
+ UpdateDPP->takeName(&II);
+ UpdateDPP->copyMetadata(II);
+ return IC.replaceInstUsesWith(II, UpdateDPP);
+ }
+
+ // No valid DPP detected
+ return std::nullopt;
+}
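
A worked restatement of the row-share mask check above (isRowShareMask is a
hypothetical helper): the mask must keep the row-base bits of the thread ID
(bit 4, plus bit 5 on wave64) and clear the four within-row bits, so
Tid & Mask yields the base lane of each 16-lane row.

  #include <cstdint>

  bool isRowShareMask(uint64_t Mask, unsigned WavefrontSizeLog2) {
    uint64_t MaskCheck = (uint64_t(1) << WavefrontSizeLog2) - 1; // 0x1F or 0x3F
    uint64_t MaskTarget = MaskCheck & 0xF0;                      // 0x10 or 0x30
    return (Mask & MaskCheck) == MaskTarget;
  }
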
+
Instruction *
GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
IntrinsicInst &II) const {
@@ -788,7 +871,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
Exp = 0;
- return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
+ return IC.replaceInstUsesWith(II,
+ ConstantInt::getSigned(II.getType(), Exp));
}
if (isa<PoisonValue>(Src))
@@ -1458,30 +1542,30 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
- if (isa<UndefValue>(Src)) {
- auto *QNaN = ConstantFP::get(
- II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics()));
- return IC.replaceInstUsesWith(II, QNaN);
- }
+ if (isa<UndefValue>(Segment))
+ return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
- const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src);
- if (!Csrc)
+ if (II.isStrictFP())
break;
- if (II.isStrictFP())
+ const ConstantFP *CSrc = dyn_cast<ConstantFP>(Src);
+ if (!CSrc && !isa<UndefValue>(Src))
break;
- const APFloat &Fsrc = Csrc->getValueAPF();
- if (Fsrc.isNaN()) {
- auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet());
- return IC.replaceInstUsesWith(II, Quieted);
- }
+    // The instruction ignores special cases and literally just extracts the
+    // exponent. Fold undef to NaN, and index the table as normal.
+ APInt FSrcInt = CSrc ? CSrc->getValueAPF().bitcastToAPInt()
+ : APFloat::getQNaN(II.getType()->getFltSemantics())
+ .bitcastToAPInt();
const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
- if (!Cseg)
+ if (!Cseg) {
+ if (isa<UndefValue>(Src))
+ return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
break;
+ }
- unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
+ unsigned Exponent = FSrcInt.extractBitsAsZExtValue(11, 52);
unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
unsigned Shift = SegmentVal * 53;
if (Exponent > 1077)
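
The rewritten extraction above is the standard IEEE-754 double decomposition;
a standalone equivalent of what the old shift-and-mask did (biasedExponent is
a hypothetical helper using plain integers rather than APInt):

  #include <cstdint>
  #include <cstring>

  // Matches FSrcInt.extractBitsAsZExtValue(11, 52): the 11-bit biased
  // exponent starting at bit 52 of the double's bit pattern.
  unsigned biasedExponent(double D) {
    uint64_t Bits;
    std::memcpy(&Bits, &D, sizeof(Bits));
    return (Bits >> 52) & 0x7ff;
  }
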
@@ -1737,6 +1821,33 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
NewII->takeName(&II);
return IC.replaceInstUsesWith(II, NewII);
}
+ case Intrinsic::amdgcn_tensor_load_to_lds:
+ case Intrinsic::amdgcn_tensor_store_from_lds: {
+ Value *D2 = II.getArgOperand(2);
+ Value *D3 = II.getArgOperand(3);
+ // We know that not passing the second and third tensor DMA groups is
+ // equivalent to passing zeroes for those registers, so we rewrite to the
+ // shorter form here. Undef or poison are replaced by 0.
+ auto Pred = m_CombineOr(m_Zero(), m_Undef());
+ if (!match(D2, Pred) || !match(D3, Pred))
+ return std::nullopt;
+
+ auto ShortIntrinsic = IID == Intrinsic::amdgcn_tensor_load_to_lds
+ ? Intrinsic::amdgcn_tensor_load_to_lds_d2
+ : Intrinsic::amdgcn_tensor_store_from_lds_d2;
+ CallInst *NewII = IC.Builder.CreateIntrinsic(
+ ShortIntrinsic,
+ {II.getArgOperand(0), II.getArgOperand(1), II.getArgOperand(4)});
+ NewII->takeName(&II);
+ NewII->copyMetadata(II);
+ return IC.eraseInstFromFunction(II);
+ }
+ case Intrinsic::amdgcn_wave_shuffle: {
+ if (!ST->hasDPP())
+ return std::nullopt;
+
+ return tryWaveShuffleDPP(*ST, IC, II);
+ }
}
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 5085e86..2b1f404 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -29,11 +29,19 @@ Intrinsic::ID AMDGPU::getIntrinsicID(const MachineInstr &I) {
// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
bool AMDGPU::isUniformMMO(const MachineMemOperand *MMO) {
const Value *Ptr = MMO->getValue();
+ if (!Ptr) {
+ if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) {
+ return PSV->isConstantPool() || PSV->isStack() || PSV->isGOT() ||
+ PSV->isJumpTable();
+ }
+
+ // Unknown value.
+ return false;
+ }
+
// UndefValue means this is a load of a kernel input. These are uniform.
// Sometimes LDS instructions have constant pointers.
- // If Ptr is null, then that means this mem operand contains a
- // PseudoSourceValue like GOT.
- if (!Ptr || isa<UndefValue, Constant, GlobalValue>(Ptr))
+ if (isa<UndefValue, Constant, GlobalValue>(Ptr))
return true;
if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 0eb00cb..529da8d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -50,6 +50,7 @@ const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr);
struct ImageDimIntrinsicInfo {
unsigned Intr;
unsigned BaseOpcode;
+ unsigned AtomicNoRetBaseOpcode;
MIMGDim Dim;
uint8_t NumOffsetArgs;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index b8fa6f3..8dc5d45 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -62,6 +62,7 @@ def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
// AMDGPU DAG Nodes
//
+// Masked control flow nodes.
def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>;
def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;
@@ -114,6 +115,7 @@ def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
[SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPOptInGlue]
>;
+// Pointer to the start of the shader's constant data.
def AMDGPUconstdata_ptr : SDNode<
"AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>,
SDTCisVT<0, iPTR>]>
@@ -122,18 +124,21 @@ def AMDGPUconstdata_ptr : SDNode<
// The argument to this node is a dword address.
def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
+// SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
+// Denormals handled on some parts.
def AMDGPUcos_impl : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>;
def AMDGPUsin_impl : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>;
+
// out = a - floor(a)
def AMDGPUfract_impl : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
// out = 1.0 / a
def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
-// v_log_f32, which is log2
+// v_log_f32, which is log2, no denormal handling for f32.
def AMDGPUlog_impl : SDNode<"AMDGPUISD::LOG", SDTFPUnaryOp>;
-// v_exp_f32, which is exp2
+// v_exp_f32, which is exp2, no denormal handling for f32.
def AMDGPUexp_impl : SDNode<"AMDGPUISD::EXP", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a)
@@ -146,11 +151,16 @@ def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a) result clamped to +/- max_float.
def AMDGPUrsq_clamp_impl : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
+// Convert two float 32 numbers into a single register holding two packed f16
+// with round to zero.
def AMDGPUpkrtz_f16_f32_impl : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
def AMDGPUpknorm_i16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
+
+// Same as the standard node, except the high bits of the resulting integer
+// are known 0.
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
@@ -225,14 +235,18 @@ def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc
SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
]>;
+// This is SETCC with the full mask result which is used for a compare with a
+// result bit per item in the wavefront.
def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;
+// FP ops with input and output chain.
def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [
SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [
SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+// These cvt_f32_ubyte* nodes need to remain consecutive and in order.
def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
SDTIntToFPOp, []>;
def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1",
@@ -264,6 +278,8 @@ def AMDGPUdiv_fmas_impl : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp,
// Denominator, src2 = Numerator).
def AMDGPUdiv_fixup_impl : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
+// For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is
+// treated as an illegal operation.
def AMDGPUfmad_ftz_impl : SDNode<"AMDGPUISD::FMAD_FTZ", SDTFPTernaryOp>;
def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
@@ -290,14 +306,23 @@ def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
SDNPMemOperand]>;
+// Extract range of bits with zero extension to 32-bits.
def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
+
+// Extract range of bits with sign extension to 32-bits.
def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
+
+// (src0 & src1) | (~src0 & src2)
def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
+
+// Insert a range of bits into a 32-bit word.
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
+// ctlz with -1 if input is zero.
def AMDGPUffbh_u32_impl : SDNode<"AMDGPUISD::FFBH_U32", SDTIntBitCountUnaryOp>;
def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>;
+// cttz with -1 if input is zero.
def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>;
// Signed and unsigned 24-bit multiply. The highest 8-bits are ignored
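
The node comments added in this hunk can be restated as a tiny reference
model (hypothetical helpers, C++20 <bit>):

  #include <bit>
  #include <cstdint>

  // BFI: (src0 & src1) | (~src0 & src2).
  uint32_t bfi(uint32_t S0, uint32_t S1, uint32_t S2) {
    return (S0 & S1) | (~S0 & S2);
  }
  // FFBH_U32: ctlz, but -1 when the input is zero.
  int32_t ffbh_u32(uint32_t X) { return X ? std::countl_zero(X) : -1; }
  // FFBL_B32: cttz, but -1 when the input is zero.
  int32_t ffbl_b32(uint32_t X) { return X ? std::countr_zero(X) : -1; }
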
@@ -377,6 +402,15 @@ def AMDGPUExportOp : SDTypeProfile<0, 8, [
]>;
+def AMDGPUflat_load_monitor : SDNode<
+ "AMDGPUISD::FLAT_LOAD_MONITOR", SDTLoad,
+ [SDNPHasChain, SDNPMemOperand]
+>;
+
+def AMDGPUglobal_load_monitor : SDNode<
+ "AMDGPUISD::GLOBAL_LOAD_MONITOR", SDTLoad,
+ [SDNPHasChain, SDNPMemOperand]
+>;
//===----------------------------------------------------------------------===//
// Flow Control Profile Types
@@ -394,16 +428,24 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai
//===----------------------------------------------------------------------===//
// Call/Return DAG Nodes
//===----------------------------------------------------------------------===//
+
+// A uniform kernel return that terminates the wavefront.
def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
+
+// s_endpgm, but we may want to insert it in the middle of the block.
def AMDGPUendpgm_trap : SDNode<"AMDGPUISD::ENDPGM_TRAP", SDTNone,
[SDNPHasChain]>;
+
+// "s_trap 2" equivalent on hardware that does not support it.
def AMDGPUsimulated_trap : SDNode<"AMDGPUISD::SIMULATED_TRAP", SDTNone,
[SDNPHasChain]>;
+// Return to a shader part's epilog code.
def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+// Return with values from a non-entry function.
def AMDGPUret_glue : SDNode<"AMDGPUISD::RET_GLUE", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 12915c73..82783dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -116,8 +116,14 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
if (!DstRC || DstRC != SrcRC)
return false;
- return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
- RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
+ if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
+ !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
+ return false;
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
+ MI.getOperand(0).setIsEarlyClobber(true);
+ }
+ return true;
}
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
@@ -221,14 +227,22 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock *BB = I.getParent();
+ Register VCCReg = I.getOperand(1).getReg();
+ MachineInstr *Cmp;
+
+ // Set SCC as a side effect with S_CMP or S_OR.
+ if (STI.hasScalarCompareEq64()) {
+ unsigned CmpOpc =
+ STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
+ Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
+ } else {
+ Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
+ .addReg(VCCReg)
+ .addReg(VCCReg);
+ }
- unsigned CmpOpc =
- STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
- MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
- .addReg(I.getOperand(1).getReg())
- .addImm(0);
- if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
- return false;
+ constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
Register DstReg = I.getOperand(0).getReg();
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
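
Both forms emitted above materialize the same predicate; a one-line model
(sccFromVcc is illustrative only, not the emitted code):

  #include <cstdint>

  // S_CMP_LG_U64/S_CMP_LG_U32 compare the mask against zero, and the
  // dead-destination S_OR_B64 of vcc with itself sets SCC the same way;
  // the trailing COPY then reads SCC = (vcc != 0).
  bool sccFromVcc(uint64_t Vcc) { return Vcc != 0; }
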
@@ -269,7 +283,8 @@ bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
.addImm(0);
I.eraseFromParent();
- return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
+ return true;
}
bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
@@ -283,7 +298,8 @@ bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
.addReg(SrcReg);
I.eraseFromParent();
- return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
+ return true;
}
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
@@ -350,7 +366,7 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
Register Reg = MO.getReg();
BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
- .addReg(Reg, 0, ComposedSubIdx);
+ .addReg(Reg, {}, ComposedSubIdx);
return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
MO.isKill(), MO.isDead(), MO.isUndef(),
@@ -400,10 +416,11 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
// Dead implicit-def of scc
I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
- true, // isImp
- false, // isKill
- true)); // isDead
- return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ true, // isImp
+ false, // isKill
+ true)); // isDead
+ constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ return true;
}
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
@@ -429,15 +446,17 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
.add(I.getOperand(2))
.setOperandDead(3); // Dead scc
I.eraseFromParent();
- return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
+ return true;
}
- if (STI.hasAddNoCarry()) {
+ if (STI.hasAddNoCarryInsts()) {
const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
I.setDesc(TII.get(Opc));
I.addOperand(*MF, MachineOperand::CreateImm(0));
I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
- return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ return true;
}
const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
@@ -450,7 +469,8 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
.add(I.getOperand(2))
.addImm(0);
I.eraseFromParent();
- return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
+ return true;
}
assert(!Sub && "illegal sub should not reach here");
@@ -491,8 +511,7 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
.addReg(CarryReg, RegState::Kill)
.addImm(0);
- if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
- return false;
+ constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI);
}
BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
@@ -528,7 +547,8 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
I.addOperand(*MF, MachineOperand::CreateImm(0));
- return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ return true;
}
Register Src0Reg = I.getOperand(2).getReg();
@@ -593,7 +613,9 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
I.setDesc(TII.get(Opc));
I.addOperand(*MF, MachineOperand::CreateImm(0));
I.addImplicitDefUseOperands(*MF);
- return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ I.getOperand(0).setIsEarlyClobber(true);
+ constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ return true;
}
// TODO: We should probably legalize these to use only 32-bit results.
@@ -636,7 +658,7 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
*SrcRC, I.getOperand(1));
const DebugLoc &DL = I.getDebugLoc();
BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
- .addReg(SrcReg, 0, SubReg);
+ .addReg(SrcReg, {}, SubReg);
I.eraseFromParent();
return true;
@@ -709,7 +731,7 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
for (int I = 0, E = NumDst; I != E; ++I) {
MachineOperand &Dst = MI.getOperand(I);
BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
- .addReg(SrcReg, 0, SubRegs[I]);
+ .addReg(SrcReg, {}, SubRegs[I]);
// Make sure the subregister index is valid for the source register.
SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
@@ -809,15 +831,13 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
.addImm(0xFFFF)
.addReg(Src0);
- if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
- return false;
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
.addReg(Src1)
.addImm(16)
.addReg(TmpReg);
- if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
- return false;
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
MI.eraseFromParent();
return true;
@@ -863,7 +883,8 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
.setOperandDead(3); // Dead scc
MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ return true;
}
if (STI.hasSPackHL()) {
Opc = AMDGPU::S_PACK_HL_B32_B16;
@@ -872,7 +893,8 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
}
MI.setDesc(TII.get(Opc));
- return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+ return true;
}
bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
@@ -970,7 +992,8 @@ bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
.addReg(OffsetReg)
.addReg(WidthReg);
MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ return true;
}
bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
@@ -1072,7 +1095,8 @@ bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
MIB.addReg(VDstIn);
MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ return true;
}
// We need to handle this here because tablegen doesn't support matching
@@ -1113,7 +1137,8 @@ bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
.addImm(0); // $omod
MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ return true;
}
bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
@@ -1200,6 +1225,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
case Intrinsic::amdgcn_permlane16_swap:
case Intrinsic::amdgcn_permlane32_swap:
return selectPermlaneSwapIntrin(I, IntrinsicID);
+ case Intrinsic::amdgcn_wave_shuffle:
+ return selectWaveShuffleIntrin(I);
default:
return selectImpl(I, *CoverageInfo);
}
@@ -1468,8 +1495,8 @@ bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
.add(I.getOperand(3));
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
.addReg(AMDGPU::SCC);
+ constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
bool Ret =
- constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
I.eraseFromParent();
return Ret;
@@ -1499,9 +1526,9 @@ bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
*TRI.getBoolRC(), *MRI);
- bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
I.eraseFromParent();
- return Ret;
+ return true;
}
bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
@@ -1555,8 +1582,7 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
SelectedMI.addImm(0); // op_sel
RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
- if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
- return false;
+ constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI);
I.eraseFromParent();
return true;
@@ -1642,8 +1668,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
.addReg(SrcReg)
.addReg(TRI.getExec())
.setOperandDead(3); // Dead scc
- if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
- return false;
+ constrainSelectedInstRegOperands(*And, TII, TRI, RBI);
}
}
@@ -1710,7 +1735,8 @@ bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
}
I.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ return true;
}
bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
@@ -1834,9 +1860,9 @@ bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
return false;
- bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
MI.eraseFromParent();
- return Ret;
+ return true;
}
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
@@ -1930,20 +1956,52 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
// The resource id offset is computed as (<isa opaque base> + M0[21:16] +
// offset field) % 64. Some versions of the programming guide omit the m0
// part, or claim it's from offset 0.
- auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
+
+ unsigned Opc = gwsIntrinToOpcode(IID);
+ const MCInstrDesc &InstrDesc = TII.get(Opc);
if (HasVSrc) {
Register VSrc = MI.getOperand(1).getReg();
- MIB.addReg(VSrc);
- if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
- return false;
- }
+ int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
+ const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
+ const TargetRegisterClass *SubRC =
+ TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
- MIB.addImm(ImmOffset)
- .cloneMemRefs(MI);
+ if (!SubRC) {
+ // 32-bit normal case.
+ if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
+ return false;
- TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
+ BuildMI(*MBB, &MI, DL, InstrDesc)
+ .addReg(VSrc)
+ .addImm(ImmOffset)
+ .cloneMemRefs(MI);
+ } else {
+      // Requires even register alignment, so create a 64-bit value and pad the
+ // top half with undef.
+ Register DataReg = MRI->createVirtualRegister(DataRC);
+ if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
+ return false;
+
+ Register UndefReg = MRI->createVirtualRegister(SubRC);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
+ .addReg(VSrc)
+ .addImm(AMDGPU::sub0)
+ .addReg(UndefReg)
+ .addImm(AMDGPU::sub1);
+
+ BuildMI(*MBB, &MI, DL, InstrDesc)
+ .addReg(DataReg)
+ .addImm(ImmOffset)
+ .cloneMemRefs(MI);
+ }
+ } else {
+ BuildMI(*MBB, &MI, DL, InstrDesc)
+ .addImm(ImmOffset)
+ .cloneMemRefs(MI);
+ }
MI.eraseFromParent();
return true;
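
Restating the resource-id addressing comment at the top of this hunk as
arithmetic (a sketch; Base stands in for the opaque ISA base and is an
assumption of this model):

  #include <cstdint>

  unsigned gwsResourceId(unsigned Base, uint32_t M0, unsigned OffsetField) {
    return (Base + ((M0 >> 16) & 0x3F) + OffsetField) % 64; // M0[21:16]
  }
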
@@ -1978,11 +2036,12 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
.addImm(IsGDS ? -1 : 0)
.cloneMemRefs(MI);
MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ return true;
}
bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
- MachineFunction *MF = MI.getParent()->getParent();
+ MachineFunction *MF = MI.getMF();
SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
MFInfo->setInitWholeWave();
@@ -2006,19 +2065,27 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
+ unsigned IntrOpcode = Intr->BaseOpcode;
+
+ // For image atomic: use no-return opcode if result is unused.
+ if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
+ Register ResultDef = MI.getOperand(0).getReg();
+ if (MRI->use_nodbg_empty(ResultDef))
+ IntrOpcode = Intr->AtomicNoRetBaseOpcode;
+ }
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
- AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+ AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
- unsigned IntrOpcode = Intr->BaseOpcode;
const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
- Register VDataIn, VDataOut;
+ Register VDataIn = AMDGPU::NoRegister;
+ Register VDataOut = AMDGPU::NoRegister;
LLT VDataTy;
int NumVDataDwords = -1;
bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
@@ -2049,7 +2116,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
unsigned DMaskLanes = 0;
if (BaseOpcode->Atomic) {
- VDataOut = MI.getOperand(0).getReg();
+ if (!BaseOpcode->NoReturn)
+ VDataOut = MI.getOperand(0).getReg();
VDataIn = MI.getOperand(2).getReg();
LLT Ty = MRI->getType(VDataIn);
@@ -2099,8 +2167,9 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
- if (BaseOpcode->Atomic)
- CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+ // Keep GLC only when the atomic's result is actually used.
+ if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
+ CPol |= AMDGPU::CPol::GLC;
if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
AMDGPU::CPol::VOLATILE))
return false;
@@ -2280,7 +2349,8 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
.cloneMemRefs(MI);
MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ return true;
}
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
@@ -2306,17 +2376,29 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_init_whole_wave:
return selectInitWholeWave(I);
case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_buffer_load_async_lds:
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
case Intrinsic::amdgcn_struct_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_async_lds:
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
return selectBufferLoadLds(I);
// Until we can store both the address space of the global and the LDS
// arguments by having two MachineMemOperands on an intrinsic, we just trust
// that the argument is a global pointer (buffer pointers have been handled by
// a LLVM IR-level lowering).
case Intrinsic::amdgcn_load_to_lds:
+ case Intrinsic::amdgcn_load_async_to_lds:
case Intrinsic::amdgcn_global_load_lds:
+ case Intrinsic::amdgcn_global_load_async_lds:
return selectGlobalLoadLds(I);
+ case Intrinsic::amdgcn_asyncmark:
+ case Intrinsic::amdgcn_wait_asyncmark:
+    // FIXME: Not supported on GFX12 yet. Will need a new feature when we add it.
+ if (!Subtarget->hasVMemToLDSLoad())
+ return false;
+ break;
case Intrinsic::amdgcn_exp_compr:
if (!STI.hasCompressedExport()) {
Function &F = I.getMF()->getFunction();
@@ -2331,9 +2413,35 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
return selectDSBvhStackIntrinsic(I);
+ case Intrinsic::amdgcn_s_alloc_vgpr: {
+    // S_ALLOC_VGPR doesn't have a destination register; it just implicitly
+    // sets SCC, which we then need to COPY into the result vreg.
+ MachineBasicBlock *MBB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ Register ResReg = I.getOperand(0).getReg();
+
+ MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR))
+ .add(I.getOperand(2));
+ (void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
+ .addReg(AMDGPU::SCC);
+ I.eraseFromParent();
+ constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI);
+ return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
+ }
case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_signal_var:
return selectNamedBarrierInit(I, IntrinsicID);
+ case Intrinsic::amdgcn_s_wakeup_barrier: {
+ if (!STI.hasSWakeupBarrier()) {
+ Function &F = I.getMF()->getFunction();
+ F.getContext().diagnose(
+ DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
+ I.getDebugLoc(), DS_Error));
+ return false;
+ }
+ return selectNamedBarrierInst(I, IntrinsicID);
+ }
case Intrinsic::amdgcn_s_barrier_join:
case Intrinsic::amdgcn_s_get_named_barrier_state:
return selectNamedBarrierInst(I, IntrinsicID);
@@ -2372,11 +2480,10 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
.add(I.getOperand(2))
.add(I.getOperand(3));
- bool Ret = false;
- Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
- Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
I.eraseFromParent();
- return Ret;
+ return true;
}
// Wide VGPR select should have been split in RegBankSelect.
@@ -2391,9 +2498,9 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
.add(I.getOperand(2))
.add(I.getOperand(1));
- bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
I.eraseFromParent();
- return Ret;
+ return true;
}
bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
@@ -2438,7 +2545,7 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock *MBB = I.getParent();
BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
- .addReg(SrcReg, 0, AMDGPU::lo16);
+ .addReg(SrcReg, {}, AMDGPU::lo16);
I.eraseFromParent();
return true;
}
@@ -2450,9 +2557,9 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
Register LoReg = MRI->createVirtualRegister(DstRC);
Register HiReg = MRI->createVirtualRegister(DstRC);
BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
- .addReg(SrcReg, 0, AMDGPU::sub0);
+ .addReg(SrcReg, {}, AMDGPU::sub0);
BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
- .addReg(SrcReg, 0, AMDGPU::sub1);
+ .addReg(SrcReg, {}, AMDGPU::sub1);
if (IsVALU && STI.hasSDWA()) {
// Write the low 16-bits of the high element into the high 16-bits of the
@@ -2609,7 +2716,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
.addImm(Mask)
.addReg(SrcReg);
I.eraseFromParent();
- return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
+ return true;
}
const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
@@ -2619,7 +2727,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
.addImm(0) // Offset
.addImm(SrcSize); // Width
I.eraseFromParent();
- return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
+ return true;
}
if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
@@ -2644,18 +2753,18 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
if (Signed) {
BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
- .addReg(SrcReg, 0, SubReg)
- .addImm(31)
- .setOperandDead(3); // Dead scc
+ .addReg(SrcReg, {}, SubReg)
+ .addImm(31)
+ .setOperandDead(3); // Dead scc
} else {
BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
.addImm(0);
}
BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
- .addReg(SrcReg, 0, SubReg)
- .addImm(AMDGPU::sub0)
- .addReg(HiReg)
- .addImm(AMDGPU::sub1);
+ .addReg(SrcReg, {}, SubReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(HiReg)
+ .addImm(AMDGPU::sub1);
I.eraseFromParent();
return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
*MRI);
@@ -2673,10 +2782,10 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
- .addReg(SrcReg, 0, SubReg)
- .addImm(AMDGPU::sub0)
- .addReg(UndefReg)
- .addImm(AMDGPU::sub1);
+ .addReg(SrcReg, {}, SubReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(UndefReg)
+ .addImm(AMDGPU::sub1);
BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
.addReg(ExtReg)
@@ -2810,9 +2919,9 @@ bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
- .addReg(Src, 0, AMDGPU::sub0);
+ .addReg(Src, {}, AMDGPU::sub0);
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
- .addReg(Src, 0, AMDGPU::sub1);
+ .addReg(Src, {}, AMDGPU::sub1);
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
.addImm(0x80000000);
@@ -2852,9 +2961,9 @@ bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
return false;
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
- .addReg(Src, 0, AMDGPU::sub0);
+ .addReg(Src, {}, AMDGPU::sub0);
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
- .addReg(Src, 0, AMDGPU::sub1);
+ .addReg(Src, {}, AMDGPU::sub1);
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
.addImm(0x7fffffff);
@@ -3093,7 +3202,8 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
.addReg(MaskReg)
.setOperandDead(3); // Dead scc
I.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ return true;
}
unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
@@ -3129,9 +3239,9 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
// Extract the subregisters from the source pointer.
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
- .addReg(SrcReg, 0, AMDGPU::sub0);
+ .addReg(SrcReg, {}, AMDGPU::sub0);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
- .addReg(SrcReg, 0, AMDGPU::sub1);
+ .addReg(SrcReg, {}, AMDGPU::sub1);
Register MaskedLo, MaskedHi;
@@ -3144,7 +3254,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
MaskedLo = MRI->createVirtualRegister(&RegRC);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
- .addReg(MaskReg, 0, AMDGPU::sub0);
+ .addReg(MaskReg, {}, AMDGPU::sub0);
BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
.addReg(LoReg)
.addReg(MaskLo);
@@ -3158,7 +3268,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
MaskedHi = MRI->createVirtualRegister(&RegRC);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
- .addReg(MaskReg, 0, AMDGPU::sub1);
+ .addReg(MaskReg, {}, AMDGPU::sub1);
BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
.addReg(HiReg)
.addReg(MaskHi);
@@ -3246,8 +3356,8 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
- .addReg(SrcReg, 0, SubReg)
- .addReg(SrcReg, RegState::Implicit);
+ .addReg(SrcReg, {}, SubReg)
+ .addReg(SrcReg, RegState::Implicit);
MI.eraseFromParent();
return true;
}
@@ -3259,8 +3369,8 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
.addReg(IdxReg);
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
- .addReg(SrcReg, 0, SubReg)
- .addReg(SrcReg, RegState::Implicit);
+ .addReg(SrcReg, {}, SubReg)
+ .addReg(SrcReg, RegState::Implicit);
MI.eraseFromParent();
return true;
}
@@ -3350,11 +3460,25 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
return true;
}
+static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
+ switch (Intr) {
+ case Intrinsic::amdgcn_raw_buffer_load_async_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_async_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
+ case Intrinsic::amdgcn_load_async_to_lds:
+ case Intrinsic::amdgcn_global_load_async_lds:
+ return true;
+ }
+ return false;
+}
+
bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
if (!Subtarget->hasVMemToLDSLoad())
return false;
unsigned Opc;
unsigned Size = MI.getOperand(3).getImm();
+ Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
// The struct intrinsic variants add one additional operand over raw.
const bool HasVIndex = MI.getNumOperands() == 9;
@@ -3444,12 +3568,17 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
? 1
: 0); // swz
+ MIB.addImm(isAsyncLDSDMA(IntrinsicID));
MachineMemOperand *LoadMMO = *MI.memoperands_begin();
+ // Don't set the offset value here because the pointer points to the base of
+ // the buffer.
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
- LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
+
MachinePointerInfo StorePtrI = LoadPtrI;
- StorePtrI.V = nullptr;
+ LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
+ AMDGPUAS::BUFFER_RESOURCE));
+ LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
auto F = LoadMMO->getFlags() &
@@ -3464,7 +3593,8 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
MIB.setMemRefs({LoadMMO, StoreMMO});
MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ return true;
}
/// Match a zero extend from a 32-bit value to 64-bits.
@@ -3561,6 +3691,7 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
unsigned Opc;
unsigned Size = MI.getOperand(3).getImm();
+ Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
switch (Size) {
default:
@@ -3627,13 +3758,18 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
if (isSGPR(Addr))
MIB.addReg(VOffset);
- MIB.add(MI.getOperand(4)) // offset
- .add(MI.getOperand(5)); // cpol
+ MIB.add(MI.getOperand(4)); // offset
+
+ unsigned Aux = MI.getOperand(5).getImm();
+ MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
+ MIB.addImm(isAsyncLDSDMA(IntrinsicID));
MachineMemOperand *LoadMMO = *MI.memoperands_begin();
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
LoadPtrI.Offset = MI.getOperand(4).getImm();
MachinePointerInfo StorePtrI = LoadPtrI;
+ LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
+ AMDGPUAS::GLOBAL_ADDRESS));
LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
auto F = LoadMMO->getFlags() &
@@ -3647,7 +3783,8 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
MIB.setMemRefs({LoadMMO, StoreMMO});
MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ return true;
}
bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
@@ -3656,8 +3793,9 @@ bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
MI.removeOperand(OpcodeOpIdx);
- MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
- return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+ MI.addImplicitDefUseOperands(*MI.getMF());
+ constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+ return true;
}
// FIXME: This should be removed and let the patterns select. We just need the
@@ -3759,7 +3897,11 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
MI.removeOperand(4); // VDst_In
MI.removeOperand(1); // Intrinsic ID
MI.addOperand(VDst_In); // Readd VDst_In to the end
- MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
+ MI.addImplicitDefUseOperands(*MI.getMF());
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
+ MI.getOperand(0).setIsEarlyClobber(true);
+ }
return true;
}
@@ -3783,7 +3925,8 @@ bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
MachineOperand &FI = MI.getOperand(4);
FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0);
- return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+ return true;
}
bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
@@ -3814,6 +3957,133 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
return true;
}
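
The selectWaveShuffleIntrin lowering added below splits a wave64 shuffle into
a same-half bpermute and a swapped-half (permlane64) bpermute; a minimal
model of why selecting on bit 5 of (lane ^ index) recovers the full shuffle
(waveShuffle64 is a hypothetical reference, not the emitted code):

  #include <cstdint>

  // ds_bpermute only reads within a lane's own 32-wide half; permlane64
  // swaps the halves first. Picking the same-half result exactly when bit 5
  // of (Lane ^ Idx) is clear yields Val[Idx & 63] for every lane.
  uint32_t waveShuffle64(const uint32_t Val[64], unsigned Lane, unsigned Idx) {
    unsigned Within = Idx & 31;
    uint32_t SameHalf = Val[(Lane & 32) | Within];
    uint32_t OppHalf = Val[((Lane & 32) ^ 32) | Within];
    return (((Lane ^ Idx) & 32) == 0) ? SameHalf : OppHalf;
  }
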
+bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
+ MachineInstr &MI) const {
+ assert(MI.getNumOperands() == 4);
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register ValReg = MI.getOperand(2).getReg();
+ Register IdxReg = MI.getOperand(3).getReg();
+
+ const LLT DstTy = MRI->getType(DstReg);
+ unsigned DstSize = DstTy.getSizeInBits();
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const TargetRegisterClass *DstRC =
+ TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
+
+ if (DstTy != LLT::scalar(32))
+ return false;
+
+ if (!Subtarget->supportsBPermute())
+ return false;
+
+ // If we can bpermute across the whole wave, then just do that
+ if (Subtarget->supportsWaveWideBPermute()) {
+ Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
+ .addImm(2)
+ .addReg(IdxReg);
+
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
+ .addReg(ShiftIdxReg)
+ .addReg(ValReg)
+ .addImm(0);
+ } else {
+ // Otherwise, we need to make use of whole wave mode
+ assert(Subtarget->isWave64());
+
+ // Set inactive lanes to poison
+ Register UndefValReg =
+ MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);
+
+ Register UndefExecReg = MRI->createVirtualRegister(
+ TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);
+
+ Register PoisonValReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
+ .addImm(0)
+ .addReg(ValReg)
+ .addImm(0)
+ .addReg(UndefValReg)
+ .addReg(UndefExecReg);
+
+ // ds_bpermute requires index to be multiplied by 4
+ Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
+ .addImm(2)
+ .addReg(IdxReg);
+
+ Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
+ .addImm(0)
+ .addReg(ShiftIdxReg)
+ .addImm(0)
+ .addReg(UndefValReg)
+ .addReg(UndefExecReg);
+
+ // Get permutation of each half, then we'll select which one to use
+ Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
+ .addReg(PoisonIdxReg)
+ .addReg(PoisonValReg)
+ .addImm(0);
+
+ Register SwappedValReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
+ .addReg(PoisonValReg);
+
+ Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
+ .addReg(PoisonIdxReg)
+ .addReg(SwappedValReg)
+ .addImm(0);
+
+ Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
+ .addReg(OppSidePermReg);
+
+ // Select which side to take the permute from
+ // We can get away with only using mbcnt_lo here: we only need to know
+ // which half of the wave each lane is in, and mbcnt_lo returns 32 for
+ // lanes 32-63.
+ Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
+ .addImm(-1)
+ .addImm(0);
+
+ Register XORReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
+ .addReg(ThreadIDReg)
+ .addReg(PoisonIdxReg);
+
+ Register ANDReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
+ .addReg(XORReg)
+ .addImm(32);
+
+ Register CompareReg = MRI->createVirtualRegister(
+ TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
+ .addReg(ANDReg)
+ .addImm(0);
+
+ // Finally do the selection
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
+ .addReg(WWMSwapPermReg)
+ .addImm(0)
+ .addReg(SameSidePermReg)
+ .addReg(CompareReg);
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
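
A scalar model may make the wave64 fallback above easier to follow. This is an illustrative sketch of the selection math only, not compiler code; it assumes ds_bpermute_b32 semantics (data moves only within a 32-lane half, byte-scaled index) and a 64-lane wave:

#include <array>
#include <cstdint>

// Out[Lane] must equal Val[Idx[Lane] & 63]. ds_bpermute_b32 can only read
// within the lane's own 32-lane half, so the lowering computes both the
// same-half permute and the permute of the permlane64-swapped values, then
// selects based on whether the source lane is in the same half.
static std::array<uint32_t, 64>
waveShuffleModel(const std::array<uint32_t, 64> &Val,
                 const std::array<uint32_t, 64> &Idx) {
  std::array<uint32_t, 64> Out{};
  for (unsigned Lane = 0; Lane != 64; ++Lane) {
    unsigned Src = Idx[Lane] & 63;
    unsigned SameHalf = (Lane & ~31u) | (Src & 31u); // what ds_bpermute reads
    unsigned OtherHalf = SameHalf ^ 32u;             // via the swapped values
    bool SrcInSameHalf = ((Lane ^ Src) & 32u) == 0;
    Out[Lane] = Val[SrcInSameHalf ? SameHalf : OtherHalf];
  }
  return Out;
}
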
+
// Match BITOP3 operation and return a number of matched instructions plus
// truth table.
static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
@@ -3891,7 +4161,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
SmallVector<Register, 3> Backup(Src.begin(), Src.end());
if (!getOperandBits(LHS, LHSBits) ||
!getOperandBits(RHS, RHSBits)) {
- Src = Backup;
+ Src = std::move(Backup);
return std::make_pair(0, 0);
}
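
For context on the truth-table representation: each of the up-to-three matched sources is assigned a fixed 8-bit pattern, and evaluating the matched boolean expression bitwise over those patterns yields the 8-entry truth table directly. A minimal sketch (the 0xAA/0xCC/0xF0 patterns are the conventional three-input choice, assumed here rather than quoted from this patch):

#include <cstdint>

constexpr uint8_t A = 0xAA; // 0b10101010: input 0 across the 8 table rows
constexpr uint8_t B = 0xCC; // 0b11001100: input 1
constexpr uint8_t C = 0xF0; // 0b11110000: input 2

// Example: the truth table for (a & b) | ~c falls out of bitwise evaluation.
constexpr uint8_t Table = (A & B) | uint8_t(~C);
static_assert(Table == 0x8F, "truth table for (a & b) | ~c");
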
@@ -4131,6 +4401,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UMAX:
case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
+ case TargetOpcode::G_ATOMICRMW_USUB_COND:
+ case TargetOpcode::G_ATOMICRMW_USUB_SAT:
case TargetOpcode::G_ATOMICRMW_FADD:
case TargetOpcode::G_ATOMICRMW_FMIN:
case TargetOpcode::G_ATOMICRMW_FMAX:
@@ -6726,7 +6998,7 @@ bool AMDGPUInstructionSelector::selectSGetBarrierState(
MachineInstr &I, Intrinsic::ID IntrID) const {
MachineBasicBlock *MBB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- MachineOperand BarOp = I.getOperand(2);
+ const MachineOperand &BarOp = I.getOperand(2);
std::optional<int64_t> BarValImm =
getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
@@ -6760,6 +7032,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
llvm_unreachable("not a named barrier op");
case Intrinsic::amdgcn_s_barrier_join:
return AMDGPU::S_BARRIER_JOIN_IMM;
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ return AMDGPU::S_WAKEUP_BARRIER_IMM;
case Intrinsic::amdgcn_s_get_named_barrier_state:
return AMDGPU::S_GET_BARRIER_STATE_IMM;
};
@@ -6769,6 +7043,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
llvm_unreachable("not a named barrier op");
case Intrinsic::amdgcn_s_barrier_join:
return AMDGPU::S_BARRIER_JOIN_M0;
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ return AMDGPU::S_WAKEUP_BARRIER_M0;
case Intrinsic::amdgcn_s_get_named_barrier_state:
return AMDGPU::S_GET_BARRIER_STATE_M0;
};
@@ -6779,8 +7055,8 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit(
MachineInstr &I, Intrinsic::ID IntrID) const {
MachineBasicBlock *MBB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- MachineOperand BarOp = I.getOperand(1);
- MachineOperand CntOp = I.getOperand(2);
+ const MachineOperand &BarOp = I.getOperand(1);
+ const MachineOperand &CntOp = I.getOperand(2);
// BarID = (BarOp >> 4) & 0x3F
Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index c760fe7..627cce2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -156,6 +156,7 @@ private:
bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const;
bool selectSBarrierLeave(MachineInstr &I) const;
+ bool selectWaveShuffleIntrin(MachineInstr &I) const;
std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
bool IsCanonicalizing = true,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index bd443b5..f77b4c9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -636,15 +636,11 @@ multiclass local_addr_space_atomic_op {
}
}
-defm int_amdgcn_global_atomic_csub : noret_op;
defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
defm int_amdgcn_flat_atomic_fmin_num : noret_op;
defm int_amdgcn_flat_atomic_fmax_num : noret_op;
defm int_amdgcn_global_atomic_fmin_num : noret_op;
defm int_amdgcn_global_atomic_fmax_num : noret_op;
-defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op;
-defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op;
-defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op;
multiclass noret_binary_atomic_op<SDNode atomic_op> {
let HasNoUse = true in
@@ -695,6 +691,8 @@ defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>;
defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>;
defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
+defm atomic_load_usub_cond : binary_atomic_op_all_as<atomic_load_usub_cond>;
+defm atomic_load_usub_sat : binary_atomic_op_all_as<atomic_load_usub_sat>;
defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;
def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
@@ -745,23 +743,14 @@ int FP32_ONE = 0x3f800000;
int FP32_NEG_ONE = 0xbf800000;
int FP64_ONE = 0x3ff0000000000000;
int FP64_NEG_ONE = 0xbff0000000000000;
+int BF16_ONE = 0x3F80;
+int BF16_NEG_ONE = 0xBF80;
}
def CONST : Constants;
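
The new bfloat16 constants follow from bf16 being the upper 16 bits of an IEEE-754 binary32 value; a quick illustrative check:

static_assert((0x3f800000u >> 16) == 0x3F80u, "BF16_ONE is f32 1.0 truncated");
static_assert((0xbf800000u >> 16) == 0xBF80u, "BF16_NEG_ONE is f32 -1.0 truncated");
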
-def FP_ZERO : PatLeaf <
- (fpimm),
- [{return N->getValueAPF().isZero();}]
->;
-
-def FP_ONE : PatLeaf <
- (fpimm),
- [{return N->isExactlyValue(1.0);}]
->;
-
-def FP_HALF : PatLeaf <
- (fpimm),
- [{return N->isExactlyValue(0.5);}]
->;
+def fpimm_zero : FPImmLeaf<fAny, [{ return Imm.isZero(); }]>;
+def fpimm_one : FPImmLeaf<fAny, [{ return Imm.isExactlyValue(+1.0); }]>;
+def fpimm_half : FPImmLeaf<fAny, [{ return Imm.isExactlyValue(+0.5); }]>;
/* Generic helper patterns for intrinsics */
/* -------------------------------------- */
@@ -806,24 +795,17 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : AMDGPUPat <
(vt rc:$addr)
>;
-// rotr pattern
-class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
- (rotr i32:$src0, i32:$src1),
- (BIT_ALIGN $src0, $src0, $src1)
->;
-
// Special conversion patterns
-def cvt_rpi_i32_f32 : PatFrag <
+let GIIgnoreCopies = 1 in
+def cvt_rpi_i32_f32 : PatFrag<
(ops node:$src),
- (fp_to_sint (ffloor (fadd $src, FP_HALF))),
- [{ (void) N; return TM.Options.NoNaNsFPMath; }]
->;
+ (fp_to_sint (ffloor_nnan (fadd $src, fpimm_half)))
+>, GISelFlags;
-def cvt_flr_i32_f32 : PatFrag <
+def cvt_flr_i32_f32 : PatFrag<
(ops node:$src),
- (fp_to_sint (ffloor $src)),
- [{ (void)N; return TM.Options.NoNaNsFPMath; }]
+ (fp_to_sint (ffloor_nnan $src))
>;
let AddedComplexity = 2 in {
@@ -841,7 +823,7 @@ class UMad24Pat<Instruction Inst, bit HasClamp = 0> : AMDGPUPat <
} // AddedComplexity.
class RcpPat<Instruction RcpInst, ValueType vt> : AMDGPUPat <
- (fdiv FP_ONE, vt:$src),
+ (fdiv fpimm_one, vt:$src),
(RcpInst $src)
>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h
index df80196..95d88c7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h
@@ -31,10 +31,12 @@ public:
const unsigned AndSaveExecTermOpc;
const unsigned BfmOpc;
const unsigned CMovOpc;
+ const unsigned CmpLGOp;
const unsigned CSelectOpc;
const unsigned MovOpc;
const unsigned MovTermOpc;
const unsigned OrOpc;
+ const unsigned OrN2Op;
const unsigned OrTermOpc;
const unsigned OrSaveExecOpc;
const unsigned XorOpc;
@@ -57,10 +59,12 @@ public:
: AMDGPU::S_AND_SAVEEXEC_B64_term),
BfmOpc(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
CMovOpc(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
+ CmpLGOp(IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64),
CSelectOpc(IsWave32 ? AMDGPU::S_CSELECT_B32 : AMDGPU::S_CSELECT_B64),
MovOpc(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
MovTermOpc(IsWave32 ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term),
OrOpc(IsWave32 ? AMDGPU::S_OR_B32 : AMDGPU::S_OR_B64),
+ OrN2Op(IsWave32 ? AMDGPU::S_ORN2_B32 : AMDGPU::S_ORN2_B64),
OrTermOpc(IsWave32 ? AMDGPU::S_OR_B32_term : AMDGPU::S_OR_B64_term),
OrSaveExecOpc(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
: AMDGPU::S_OR_SAVEEXEC_B64),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 7504f1a..63e2656 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -126,7 +126,38 @@ public:
return LK.first != TargetLoweringBase::TypeLegal;
}
- bool isOpLegal(Instruction *I) { return isa<StoreInst, IntrinsicInst>(I); }
+ bool isOpLegal(const Instruction *I) {
+ if (isa<IntrinsicInst>(I))
+ return true;
+
+ // Any store is a profitable sink (prevents flip-flopping)
+ if (isa<StoreInst>(I))
+ return true;
+
+ if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+ if (auto *VT = dyn_cast<FixedVectorType>(BO->getType())) {
+ if (const auto *IT = dyn_cast<IntegerType>(VT->getElementType())) {
+ unsigned EB = IT->getBitWidth();
+ unsigned EC = VT->getNumElements();
+ // Check for SDWA-compatible operation
+ if ((EB == 8 || EB == 16) && ST.hasSDWA() && EC * EB <= 32) {
+ switch (BO->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return true;
+ default:
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ return false;
+ }
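
The new acceptance rule reads as a standalone predicate: small integer element types whose total vector width fits in 32 bits, restricted to the listed opcodes. A hypothetical restatement (the helper name and free-function form are assumptions, not part of the patch):

#include "llvm/IR/Instruction.h"

static bool isSDWAFriendlyVectorOp(unsigned Opcode, unsigned EltBits,
                                   unsigned NumElts, bool HasSDWA) {
  if (!HasSDWA || (EltBits != 8 && EltBits != 16) || EltBits * NumElts > 32)
    return false;
  switch (Opcode) {
  case llvm::Instruction::Add: // e.g. <4 x i8> add, <2 x i16> add
  case llvm::Instruction::Sub:
  case llvm::Instruction::And:
  case llvm::Instruction::Or:
  case llvm::Instruction::Xor:
    return true;
  default: // e.g. <2 x i16> mul, or any <4 x i16> op (64 bits total)
    return false;
  }
}
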
bool isCoercionProfitable(Instruction *II) {
SmallPtrSet<Instruction *, 4> CVisited;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 596a895..5a993a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -30,6 +30,8 @@
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/PseudoSourceValueManager.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -411,7 +413,7 @@ static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
switch (AS) {
case AMDGPUAS::PRIVATE_ADDRESS:
// FIXME: Private element size.
- return ST.enableFlatScratch() ? 128 : 32;
+ return ST.hasFlatScratchEnabled() ? 128 : 32;
case AMDGPUAS::LOCAL_ADDRESS:
return ST.useDS128() ? 128 : 64;
case AMDGPUAS::GLOBAL_ADDRESS:
@@ -750,7 +752,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.scalarize(0);
- if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
+ if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
// Full set of gfx9 features.
if (ST.hasScalarAddSub64()) {
getActionDefinitionsBuilder({G_ADD, G_SUB})
@@ -976,9 +978,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
}
+ auto &MinNumMaxNumIeee =
+ getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
+
+ if (ST.hasVOP3PInsts()) {
+ MinNumMaxNumIeee.legalFor(FPTypesPK16)
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .clampMaxNumElements(0, S16, 2)
+ .clampScalar(0, S16, S64)
+ .scalarize(0);
+ } else if (ST.has16BitInsts()) {
+ MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
+ } else {
+ MinNumMaxNumIeee.legalFor(FPTypesBase)
+ .clampScalar(0, S32, S64)
+ .scalarize(0);
+ }
+
auto &MinNumMaxNum = getActionDefinitionsBuilder(
- {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,
- G_FMAXNUM_IEEE});
+ {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
if (ST.hasVOP3PInsts()) {
MinNumMaxNum.customFor(FPTypesPK16)
@@ -1039,6 +1057,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
.scalarize(0)
.lower();
+
+ getActionDefinitionsBuilder(G_FMODF)
+ .lowerFor({S16, S32, S64})
+ .scalarize(0)
+ .lower();
} else {
getActionDefinitionsBuilder(G_FSQRT)
.customFor({S32, S64, S16})
@@ -1072,6 +1095,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.minScalar(0, S32)
.clampScalar(1, S32, S32)
.lower();
+
+ getActionDefinitionsBuilder(G_FMODF)
+ .lowerFor({S32, S64})
+ .scalarize(0)
+ .lower();
}
auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
@@ -1171,6 +1199,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.lower();
+ // clang-format off
+ auto &FPToISat = getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
+ .legalFor({{S32, S32}, {S32, S64}})
+ .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
+ FPToISat.minScalar(1, S32);
+ FPToISat.minScalar(0, S32)
+ .widenScalarToNextPow2(0, 32)
+ .scalarize(0)
+ .lower();
+ // clang-format on
+
getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
.clampScalar(0, S16, S64)
.scalarize(0)
@@ -1705,6 +1744,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
+ auto &Atomics32 =
+ getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
+ .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
+ if (ST.hasFlatAddressSpace()) {
+ Atomics32.legalFor({{S32, FlatPtr}});
+ }
+
// TODO: v2bf16 operations, and fat buffer pointer support.
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
if (ST.hasLDSFPAtomicAddF32()) {
@@ -2136,9 +2182,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.legalFor(FPTypesPK16)
.clampMaxNumElements(0, S16, 2)
.scalarize(0);
+ } else if (ST.hasVOP3PInsts()) {
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+ .lowerFor({V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .lower();
} else {
- // TODO: Implement
- getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+ .scalarize(0)
+ .clampScalar(0, S32, S64)
+ .lower();
}
getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
@@ -2195,8 +2249,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINIMUMNUM:
case TargetOpcode::G_FMAXIMUMNUM:
- case TargetOpcode::G_FMINNUM_IEEE:
- case TargetOpcode::G_FMAXNUM_IEEE:
return legalizeMinNumMaxNum(Helper, MI);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
return legalizeExtractVectorElt(MI, MRI, B);
@@ -2299,14 +2351,14 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
return B.buildUnmerge(S32, Dst).getReg(1);
}
- // TODO: can we be smarter about machine pointer info?
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
Register LoadAddr = MRI.createGenericVirtualRegister(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
// For code object version 5, private_base and shared_base are passed through
// implicit kernargs.
if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
AMDGPU::AMDHSA_COV5) {
+ MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(B.getMF());
+
AMDGPUTargetLowering::ImplicitParameter Param =
AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
: AMDGPUTargetLowering::PRIVATE_BASE;
@@ -2321,7 +2373,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
return Register();
MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo,
+ PtrInfo.getWithOffset(Offset),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
LLT::scalar(32), commonAlignment(Align(64), Offset));
@@ -2339,6 +2391,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
return Register();
+ // TODO: Use custom PseudoSourceValue
+ MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+
// Offset into amd_queue_t for group_segment_aperture_base_hi /
// private_segment_aperture_base_hi.
uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
@@ -2538,8 +2593,14 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
uint32_t AddrHiVal = Info->get32BitAddressHighBits();
auto PtrLo = B.buildPtrToInt(S32, Src);
- auto HighAddr = B.buildConstant(S32, AddrHiVal);
- B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
+ if (AddrHiVal == 0) {
+ auto Zext = B.buildZExt(LLT::scalar(64), PtrLo);
+ B.buildIntToPtr(Dst, Zext);
+ } else {
+ auto HighAddr = B.buildConstant(S32, AddrHiVal);
+ B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
+ }
+
MI.eraseFromParent();
return true;
}
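
The new special case is the observation that merging a known-zero high half is just a zero-extension; a scalar sketch of the two paths:

#include <cstdint>

// When the subtarget's high 32 address bits are zero, the widening cast is a
// plain zext; otherwise a constant is materialized and merged as before.
static uint64_t widen32BitAddr(uint32_t Lo, uint32_t AddrHi) {
  return AddrHi == 0 ? uint64_t(Lo) : (uint64_t(AddrHi) << 32) | Lo;
}
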
@@ -2817,23 +2878,8 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
MachineFunction &MF = Helper.MIRBuilder.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
- MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
-
- // With ieee_mode disabled, the instructions have the correct behavior
- // already for G_FMINIMUMNUM/G_FMAXIMUMNUM.
- //
- // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode
- // enabled.
- if (!MFI->getMode().IEEE) {
- if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
- MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
- return true;
-
- return !IsIEEEOp;
- }
-
- if (IsIEEEOp)
+ // With ieee_mode disabled, the instructions have the correct behavior.
+ if (!MFI->getMode().IEEE)
return true;
return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
@@ -3145,16 +3191,16 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
return true; // Leave in place;
}
+ const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
- Type *Ty = GV->getValueType();
// HIP uses an unsized array `extern __shared__ T s[]` or similar
// zero-sized type in other languages to declare the dynamic shared
// memory whose size is not known at compile time. Such arrays will be
// allocated by the runtime and placed directly after the statically
// allocated ones; they all share the same offset.
- if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
+ if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
// Adjust alignment for that dynamic shared memory array.
- MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
+ MFI->setDynLDSAlign(MF.getFunction(), GVar);
LLT S32 = LLT::scalar(32);
auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
B.buildIntToPtr(DstReg, Sz);
@@ -3163,8 +3209,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
}
}
- B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
- *cast<GlobalVariable>(GV)));
+ B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), GVar));
MI.eraseFromParent();
return true;
}
@@ -3383,6 +3428,10 @@ static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
case TargetOpcode::G_INTRINSIC: {
switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
case Intrinsic::amdgcn_frexp_mant:
+ case Intrinsic::amdgcn_log:
+ case Intrinsic::amdgcn_log_clamp:
+ case Intrinsic::amdgcn_exp2:
+ case Intrinsic::amdgcn_sqrt:
return true;
default:
break;
@@ -3390,6 +3439,8 @@ static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
break;
}
+ case TargetOpcode::G_FSQRT:
+ return true;
case TargetOpcode::G_FFREXP: {
if (DefMI->getOperand(0).getReg() == Src)
return true;
@@ -3503,14 +3554,10 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
Register X = MI.getOperand(1).getReg();
unsigned Flags = MI.getFlags();
const LLT Ty = MRI.getType(X);
- MachineFunction &MF = B.getMF();
const LLT F32 = LLT::scalar(32);
const LLT F16 = LLT::scalar(16);
- const AMDGPUTargetMachine &TM =
- static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
-
if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
if (Ty == F16 && !ST.has16BitInsts()) {
Register LogVal = MRI.createGenericVirtualRegister(F32);
@@ -3544,12 +3591,14 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
-
- R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
- auto NegR = B.buildFNeg(Ty, R, Flags);
- auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
- auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
- R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
+ // The code below adds correction terms; contracting them may increase the
+ // error of the approximation, so disable contraction.
+ auto NewFlags = Flags & ~(MachineInstr::FmContract);
+ R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
+ auto NegR = B.buildFNeg(Ty, R, NewFlags);
+ auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
+ auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
+ R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
} else {
// ch+ct is ln(2)/ln(10) to more than 36 bits
const float ch_log10 = 0x1.344000p-2f;
@@ -3565,17 +3614,19 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
auto MaskConst = B.buildConstant(Ty, 0xfffff000);
auto YH = B.buildAnd(Ty, Y, MaskConst);
auto YT = B.buildFSub(Ty, Y, YH, Flags);
- auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
+ // The code below adds correction terms; contracting them may increase the
+ // error of the approximation, so disable contraction.
+ auto NewFlags = Flags & ~(MachineInstr::FmContract);
+ auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
Register Mad0 =
- getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
- Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
- R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
+ getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
+ Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags);
+ R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
}
const bool IsFiniteOnly =
- (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
- MI.getFlag(MachineInstr::FmNoInfs);
+ MI.getFlag(MachineInstr::FmNoNans) && MI.getFlag(MachineInstr::FmNoInfs);
if (!IsFiniteOnly) {
// Expand isfinite(x) => fabs(x) < inf
@@ -3699,24 +3750,39 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
return true;
}
+static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst,
+ const SrcOp &Src, unsigned Flags) {
+ LLT Ty = Dst.getLLTTy(*B.getMRI());
+
+ if (Ty == LLT::scalar(32)) {
+ return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
+ .addUse(Src.getReg())
+ .setMIFlags(Flags);
+ }
+ return B.buildFExp2(Dst, Src, Flags);
+}
+
+bool AMDGPULegalizerInfo::legalizeFExpUnsafeImpl(MachineIRBuilder &B,
+ Register Dst, Register X,
+ unsigned Flags,
+ bool IsExp10) const {
+ LLT Ty = B.getMRI()->getType(X);
+
+ // exp(x) -> exp2(M_LOG2E_F * x);
+ // exp10(x) -> exp2(log2(10) * x);
+ auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
+ auto Mul = B.buildFMul(Ty, X, Const, Flags);
+ buildExp(B, Dst, Mul, Flags);
+ return true;
+}
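
Both expansions rewrite through exp2 using a change of base; a small numeric illustration (sketch only; 0x1.a934f0p+1f is log2(10) rounded to float):

#include <cmath>
#include <cstdio>

int main() {
  const float X = 1.5f;
  const float Log2E = 1.44269504f;      // log2(e)
  const float Log2Ten = 0x1.a934f0p+1f; // log2(10) ~= 3.3219280
  // exp(x) == exp2(log2(e) * x) and exp10(x) == exp2(log2(10) * x)
  std::printf("%g vs %g\n", std::exp2(Log2E * X), std::exp(X));
  std::printf("%g vs %g\n", std::exp2(Log2Ten * X), std::pow(10.0f, X));
}
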
+
bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
Register X, unsigned Flags) const {
LLT Ty = B.getMRI()->getType(Dst);
LLT F32 = LLT::scalar(32);
if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
- auto Log2E = B.buildFConstant(Ty, numbers::log2e);
- auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
-
- if (Ty == F32) {
- B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
- .addUse(Mul.getReg(0))
- .setMIFlags(Flags);
- } else {
- B.buildFExp2(Dst, Mul.getReg(0), Flags);
- }
-
- return true;
+ return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
}
auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
@@ -3739,6 +3805,55 @@ bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
return true;
}
+bool AMDGPULegalizerInfo::legalizeFExp10Unsafe(MachineIRBuilder &B,
+ Register Dst, Register X,
+ unsigned Flags) const {
+ LLT Ty = B.getMRI()->getType(Dst);
+ LLT F32 = LLT::scalar(32);
+
+ if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
+ // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
+ auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
+ auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
+
+ auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
+ auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
+ auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
+ auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
+ B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
+ return true;
+ }
+
+ // bool s = x < -0x1.2f7030p+5f;
+ // x += s ? 0x1.0p+5f : 0.0f;
+ // exp10 = exp2(x * 0x1.a92000p+1f) *
+ // exp2(x * 0x1.4f0978p-11f) *
+ // (s ? 0x1.9f623ep-107f : 1.0f);
+
+ auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
+ auto NeedsScaling =
+ B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold);
+
+ auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
+ auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
+ auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
+
+ auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
+ auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
+
+ auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
+ auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
+ auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
+ auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
+
+ auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
+ auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
+ auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
+
+ B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
+ return true;
+}
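
The constants can be audited numerically: K0 and K1 are a two-term split of log2(10) (K0 exact in its low mantissa bits, K1 the residual), and the slow-path scale factor undoes the +32 input shift, since exp10(x) = exp10(x + 32) * 10^-32. A quick check:

#include <cmath>
#include <cstdio>

int main() {
  const double K0 = 0x1.a92000p+1;  // high part
  const double K1 = 0x1.4f0978p-11; // residual; K0 + K1 ~= log2(10)
  std::printf("K0 + K1  = %.17g\n", K0 + K1);
  std::printf("log2(10) = %.17g\n", std::log2(10.0));
  std::printf("10^-32   = %a\n", 1e-32); // ~ 0x1.9f623ep-107
}
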
+
bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
MachineIRBuilder &B) const {
Register Dst = MI.getOperand(0).getReg();
@@ -3755,18 +3870,22 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
// v_exp_f16 (fmul x, log2e)
if (allowApproxFunc(MF, Flags)) {
// TODO: Does this really require fast?
- legalizeFExpUnsafe(B, Dst, X, Flags);
+ IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
+ : legalizeFExpUnsafe(B, Dst, X, Flags);
MI.eraseFromParent();
return true;
}
+ // Nothing in half is a denormal when promoted to f32.
+ //
// exp(f16 x) ->
// fptrunc (v_exp_f32 (fmul (fpext x), log2e))
-
- // Nothing in half is a denormal when promoted to f32.
+ //
+ // exp10(f16 x) ->
+ // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
auto Ext = B.buildFPExt(F32, X, Flags);
Register Lowered = MRI.createGenericVirtualRegister(F32);
- legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
+ legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10);
B.buildFPTrunc(Dst, Lowered, Flags);
MI.eraseFromParent();
return true;
@@ -3777,7 +3896,8 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
// TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
// library behavior. Also, is known-not-daz source sufficient?
if (allowApproxFunc(MF, Flags)) {
- legalizeFExpUnsafe(B, Dst, X, Flags);
+ IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
+ : legalizeFExpUnsafe(B, Dst, X, Flags);
MI.eraseFromParent();
return true;
}
@@ -4702,6 +4822,14 @@ bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
return true;
}
+MachinePointerInfo
+AMDGPULegalizerInfo::getKernargSegmentPtrInfo(MachineFunction &MF) const {
+ // This isn't really a constant pool but close enough.
+ MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool());
+ PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS;
+ return PtrInfo;
+}
+
Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
int64_t Offset) const {
LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
@@ -4729,8 +4857,8 @@ bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
"unexpected kernarg parameter type");
Register Ptr = getKernargParameterPtr(B, Offset);
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
- B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
+ MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(B.getMF());
+ B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4),
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
MI.eraseFromParent();
@@ -6042,7 +6170,7 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
// On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
// being added, so we can only safely match a 32-bit addition with no unsigned
// overflow.
- bool CheckNUW = AMDGPU::isGFX1250(ST);
+ bool CheckNUW = ST.hasGFX1250Insts();
std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
@@ -6531,8 +6659,15 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
default:
llvm_unreachable("unhandled atomic opcode");
@@ -6766,7 +6901,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
}
Observer.changingInstr(MI);
- auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
+ scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
: AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
@@ -7194,7 +7329,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- if (!ST.isTrapHandlerEnabled() ||
+ if (!ST.hasTrapHandler() ||
ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
return legalizeTrapEndpgm(MI, MRI, B);
@@ -7253,9 +7388,9 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
return false;
// TODO: can we be smarter about machine pointer info?
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+ MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF);
MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo,
+ PtrInfo.getWithOffset(Offset),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
LLT::scalar(64), commonAlignment(Align(64), Offset));
@@ -7314,7 +7449,7 @@ bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
MachineIRBuilder &B) const {
// Is non-HSA path or trap-handler disabled? Then, report a warning
// accordingly
- if (!ST.isTrapHandlerEnabled() ||
+ if (!ST.hasTrapHandler() ||
ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
Function &Fn = B.getMF().getFunction();
Fn.getContext().diagnose(DiagnosticInfoUnsupported(
@@ -7630,6 +7765,24 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
// Replace the use G_BRCOND with the exec manipulate and branch pseudos.
auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
switch (IntrID) {
+ case Intrinsic::sponentry:
+ if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
+ // FIXME: The imported pattern checks for i32 instead of p5; if we fix
+ // that we can remove this cast.
+ const LLT S32 = LLT::scalar(32);
+ Register TmpReg = MRI.createGenericVirtualRegister(S32);
+ B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ B.buildIntToPtr(DstReg, TmpReg);
+ MI.eraseFromParent();
+ } else {
+ int FI = B.getMF().getFrameInfo().CreateFixedObject(
+ 1, 0, /*IsImmutable=*/false);
+ B.buildFrameIndex(MI.getOperand(0), FI);
+ MI.eraseFromParent();
+ }
+ return true;
case Intrinsic::amdgcn_if:
case Intrinsic::amdgcn_else: {
MachineInstr *Br = nullptr;
@@ -7717,7 +7870,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_make_buffer_rsrc:
return legalizePointerAsRsrcIntrin(MI, MRI, B);
case Intrinsic::amdgcn_kernarg_segment_ptr:
- if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
+ if (!AMDGPU::isKernel(B.getMF().getFunction())) {
// This only makes sense to call in a kernel, so just lower to null.
B.buildConstant(MI.getOperand(0).getReg(), 0);
MI.eraseFromParent();
@@ -7940,6 +8093,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
@@ -8043,6 +8204,26 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
MI.eraseFromParent();
return true;
+ case Intrinsic::amdgcn_flat_load_monitor_b32:
+ case Intrinsic::amdgcn_flat_load_monitor_b64:
+ case Intrinsic::amdgcn_flat_load_monitor_b128:
+ assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
+ B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(2))
+ .addMemOperand(*MI.memoperands_begin());
+ MI.eraseFromParent();
+ return true;
+ case Intrinsic::amdgcn_global_load_monitor_b32:
+ case Intrinsic::amdgcn_global_load_monitor_b64:
+ case Intrinsic::amdgcn_global_load_monitor_b128:
+ assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
+ B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(2))
+ .addMemOperand(*MI.memoperands_begin());
+ MI.eraseFromParent();
+ return true;
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index cd44a9b..1224ee7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -91,8 +91,12 @@ public:
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src,
bool IsLog10, unsigned Flags) const;
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const;
+ bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src,
+ unsigned Flags, bool IsExp10) const;
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src,
unsigned Flags) const;
+ bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src,
+ unsigned Flags) const;
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -132,6 +136,7 @@ public:
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
+ MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const;
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const;
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B,
uint64_t Offset,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index aa75534..4de9349 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -47,9 +47,7 @@ namespace llvm {
class AMDGPULibCalls {
private:
- const TargetLibraryInfo *TLInfo = nullptr;
- AssumptionCache *AC = nullptr;
- DominatorTree *DT = nullptr;
+ SimplifyQuery SQ;
using FuncInfo = llvm::AMDGPULibFunc;
@@ -129,11 +127,10 @@ protected:
}
public:
- AMDGPULibCalls() = default;
+ AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM);
bool fold(CallInst *CI);
- void initFunction(Function &F, FunctionAnalysisManager &FAM);
void initNativeFuncs();
// Replace a normal math function call with that native version
@@ -422,11 +419,11 @@ bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
return FPOp->isFast();
}
-void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) {
- AC = &FAM.getResult<AssumptionAnalysis>(F);
- TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
- DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
-}
+AMDGPULibCalls::AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM)
+ : SQ(F.getParent()->getDataLayout(),
+ &FAM.getResult<TargetLibraryAnalysis>(F),
+ FAM.getCachedResult<DominatorTreeAnalysis>(F),
+ &FAM.getResult<AssumptionAnalysis>(F)) {}
bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
return AllNative || llvm::is_contained(UseNative, F);
@@ -563,74 +560,6 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
return true;
}
-static bool isKnownIntegral(const Value *V, const DataLayout &DL,
- FastMathFlags FMF) {
- if (isa<PoisonValue>(V))
- return true;
- if (isa<UndefValue>(V))
- return false;
-
- if (const ConstantFP *CF = dyn_cast<ConstantFP>(V))
- return CF->getValueAPF().isInteger();
-
- auto *VFVTy = dyn_cast<FixedVectorType>(V->getType());
- const Constant *CV = dyn_cast<Constant>(V);
- if (VFVTy && CV) {
- unsigned NumElts = VFVTy->getNumElements();
- for (unsigned i = 0; i != NumElts; ++i) {
- Constant *Elt = CV->getAggregateElement(i);
- if (!Elt)
- return false;
- if (isa<PoisonValue>(Elt))
- continue;
-
- const ConstantFP *CFP = dyn_cast<ConstantFP>(Elt);
- if (!CFP || !CFP->getValue().isInteger())
- return false;
- }
-
- return true;
- }
-
- const Instruction *I = dyn_cast<Instruction>(V);
- if (!I)
- return false;
-
- switch (I->getOpcode()) {
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- // TODO: Could check nofpclass(inf) on incoming argument
- if (FMF.noInfs())
- return true;
-
- // Need to check int size cannot produce infinity, which computeKnownFPClass
- // knows how to do already.
- return isKnownNeverInfinity(I, SimplifyQuery(DL));
- case Instruction::Call: {
- const CallInst *CI = cast<CallInst>(I);
- switch (CI->getIntrinsicID()) {
- case Intrinsic::trunc:
- case Intrinsic::floor:
- case Intrinsic::ceil:
- case Intrinsic::rint:
- case Intrinsic::nearbyint:
- case Intrinsic::round:
- case Intrinsic::roundeven:
- return (FMF.noInfs() && FMF.noNaNs()) ||
- isKnownNeverInfOrNaN(I, SimplifyQuery(DL));
- default:
- break;
- }
-
- break;
- }
- default:
- break;
- }
-
- return false;
-}
-
// Returns true if the call was changed, false otherwise.
bool AMDGPULibCalls::fold(CallInst *CI) {
Function *Callee = CI->getCalledFunction();
@@ -753,16 +682,14 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
// pow(x, y) -> powr(x, y) for x >= -0.0
// TODO: Account for flags on current call
- if (PowrFunc &&
- cannotBeOrderedLessThanZero(
- FPOp->getOperand(0),
- SimplifyQuery(M->getDataLayout(), TLInfo, DT, AC, Call))) {
+ if (PowrFunc && cannotBeOrderedLessThanZero(
+ FPOp->getOperand(0), SQ.getWithInstruction(Call))) {
Call->setCalledFunction(PowrFunc);
return fold_pow(FPOp, B, PowrInfo) || true;
}
// pow(x, y) -> pown(x, y) for known integral y
- if (isKnownIntegral(FPOp->getOperand(1), M->getDataLayout(),
+ if (isKnownIntegral(FPOp->getOperand(1), SQ.getWithInstruction(CI),
FPOp->getFastMathFlags())) {
FunctionType *PownType = getPownType(CI->getFunctionType());
AMDGPULibFunc PownInfo(AMDGPULibFunc::EI_POWN, PownType, true);
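
For the scalar-constant case, the integral test reduces to "finite and equal to its own truncation"; a minimal model (the shared isKnownIntegral used above also handles vector constants, int-to-float casts, and rounding intrinsics such as floor and trunc):

#include <cmath>

static bool isKnownIntegralConstant(double Y) {
  return std::isfinite(Y) && Y == std::trunc(Y);
}
// e.g. isKnownIntegralConstant(4.0) holds, so pow(x, 4.0) may become
// pown(x, 4), while pow(x, 0.5) is left alone.
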
@@ -845,7 +772,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
return false;
}
}
- LLVMContext &context = CI->getParent()->getParent()->getContext();
+ LLVMContext &context = CI->getContext();
Constant *nval;
if (getArgType(FInfo) == AMDGPULibFunc::F32) {
SmallVector<float, 0> FVal;
@@ -1084,7 +1011,8 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
// We cannot handle corner cases for a general pow() function, give up
// unless y is a constant integral value. Then proceed as if it were pown.
- if (!isKnownIntegral(opr1, M->getDataLayout(), FPOp->getFastMathFlags()))
+ if (!isKnownIntegral(opr1, SQ.getWithInstruction(cast<Instruction>(FPOp)),
+ FPOp->getFastMathFlags()))
return false;
}
@@ -1113,22 +1041,33 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F");
}
nval = B.CreateFMul(opr1, nval, "__ylogx");
- nval = CreateCallEx(B,ExpExpr, nval, "__exp2");
+
+ CallInst *Exp2Call = CreateCallEx(B, ExpExpr, nval, "__exp2");
+
+ // TODO: Generalized fpclass logic for pow
+ FPClassTest KnownNot = FPClassTest::fcNegative;
+ if (FPOp->hasNoNaNs())
+ KnownNot |= FPClassTest::fcNan;
+
+ Exp2Call->addRetAttr(
+ Attribute::getWithNoFPClass(Exp2Call->getContext(), KnownNot));
+ nval = Exp2Call;
if (needcopysign) {
Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
Type *nTy = FPOp->getType()->getWithNewType(nTyS);
- unsigned size = nTy->getScalarSizeInBits();
Value *opr_n = FPOp->getOperand(1);
if (opr_n->getType()->getScalarType()->isIntegerTy())
opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
else
opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");
+ unsigned size = nTy->getScalarSizeInBits();
Value *sign = B.CreateShl(opr_n, size-1, "__yeven");
sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
- nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
- nval = B.CreateBitCast(nval, opr0->getType());
+
+ nval = B.CreateCopySign(nval, B.CreateBitCast(sign, nval->getType()),
+ nullptr, "__pow_sign");
}
LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
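
The copysign form computes the same sign as the old shift/and/or sequence; a scalar model of the sign fix-up for integral y (illustrative, f32 only):

#include <cmath>
#include <cstdint>
#include <cstring>

static float powIntegralYModel(float X, int32_t Y) {
  float Mag = std::exp2(float(Y) * std::log2(std::fabs(X)));
  uint32_t XBits;
  std::memcpy(&XBits, &X, sizeof(XBits));
  // (Y << 31) isolates Y's parity in the sign position; ANDing with X's bits
  // leaves the sign bit set exactly when Y is odd and X is negative.
  uint32_t SignBits = (uint32_t(Y) << 31) & XBits;
  float Sign;
  std::memcpy(&Sign, &SignBits, sizeof(Sign));
  return std::copysign(Mag, Sign);
}
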
@@ -1333,7 +1272,7 @@ AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
// TODO: Is it worth trying to preserve the location for the cos calls for the
// load?
- LoadInst *LoadCos = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
+ LoadInst *LoadCos = B.CreateLoad(Arg->getType(), Alloc);
return {SinCos, LoadCos, SinCos};
}
@@ -1699,9 +1638,8 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
FunctionAnalysisManager &AM) {
- AMDGPULibCalls Simplifier;
+ AMDGPULibCalls Simplifier(F, AM);
Simplifier.initNativeFuncs();
- Simplifier.initFunction(F, AM);
bool Changed = false;
@@ -1728,9 +1666,8 @@ PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
if (UseNative.empty())
return PreservedAnalyses::all();
- AMDGPULibCalls Simplifier;
+ AMDGPULibCalls Simplifier(F, AM);
Simplifier.initNativeFuncs();
- Simplifier.initFunction(F, AM);
bool Changed = false;
for (auto &BB : F) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 0a59132..05e97d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -635,7 +635,8 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst(
MemSetInst &MSI) {
if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
return false;
- llvm::expandMemSetAsLoop(&MSI);
+ llvm::expandMemSetAsLoop(&MSI,
+ TM->getTargetTransformInfo(*MSI.getFunction()));
MSI.eraseFromParent();
return true;
}
@@ -1565,8 +1566,11 @@ void SplitPtrStructs::processConditionals() {
} else if (isa<SelectInst>(I)) {
if (MaybeRsrc) {
if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) {
- ConditionalTemps.push_back(RsrcInst);
- RsrcInst->replaceAllUsesWith(*MaybeRsrc);
+ // Guard against conditionals that were already folded away.
+ if (RsrcInst != *MaybeRsrc) {
+ ConditionalTemps.push_back(RsrcInst);
+ RsrcInst->replaceAllUsesWith(*MaybeRsrc);
+ }
}
for (Value *V : Seen)
FoundRsrcs[V] = *MaybeRsrc;
@@ -1745,6 +1749,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
case AtomicRMWInst::FMin:
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin;
break;
+ case AtomicRMWInst::USubCond:
+ IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32;
+ break;
+ case AtomicRMWInst::USubSat:
+ IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32;
+ break;
case AtomicRMWInst::FSub: {
reportFatalUsageError(
"atomic floating point subtraction not supported for "
@@ -1770,14 +1780,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
break;
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
- reportFatalUsageError("wrapping increment/decrement not supported for "
- "buffer resources and should've ben expanded away");
+ reportFatalUsageError(
+ "wrapping increment/decrement not supported for "
+ "buffer resources and should've been expanded away");
break;
case AtomicRMWInst::BAD_BINOP:
llvm_unreachable("Not sure how we got a bad binop");
- case AtomicRMWInst::USubCond:
- case AtomicRMWInst::USubSat:
- break;
}
}
@@ -2059,17 +2067,7 @@ PtrParts SplitPtrStructs::visitICmpInst(ICmpInst &Cmp) {
"Pointer comparison is only equal or unequal");
auto [LhsRsrc, LhsOff] = getPtrParts(Lhs);
auto [RhsRsrc, RhsOff] = getPtrParts(Rhs);
- Value *RsrcCmp =
- IRB.CreateICmp(Pred, LhsRsrc, RhsRsrc, Cmp.getName() + ".rsrc");
- copyMetadata(RsrcCmp, &Cmp);
- Value *OffCmp = IRB.CreateICmp(Pred, LhsOff, RhsOff, Cmp.getName() + ".off");
- copyMetadata(OffCmp, &Cmp);
-
- Value *Res = nullptr;
- if (Pred == ICmpInst::ICMP_EQ)
- Res = IRB.CreateAnd(RsrcCmp, OffCmp);
- else if (Pred == ICmpInst::ICMP_NE)
- Res = IRB.CreateOr(RsrcCmp, OffCmp);
+ Value *Res = IRB.CreateICmp(Pred, LhsOff, RhsOff);
copyMetadata(Res, &Cmp);
Res->takeName(&Cmp);
SplitUsers.insert(&Cmp);
@@ -2210,6 +2208,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
case Intrinsic::memset_inline:
case Intrinsic::experimental_memset_pattern:
case Intrinsic::amdgcn_load_to_lds:
+ case Intrinsic::amdgcn_load_async_to_lds:
return true;
}
}
@@ -2298,7 +2297,8 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
SplitUsers.insert(&I);
return {NewRsrc, Off};
}
- case Intrinsic::amdgcn_load_to_lds: {
+ case Intrinsic::amdgcn_load_to_lds:
+ case Intrinsic::amdgcn_load_async_to_lds: {
Value *Ptr = I.getArgOperand(0);
if (!isSplitFatPtr(Ptr->getType()))
return {nullptr, nullptr};
@@ -2309,9 +2309,12 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
Value *ImmOff = I.getArgOperand(3);
Value *Aux = I.getArgOperand(4);
Value *SOffset = IRB.getInt32(0);
+ Intrinsic::ID NewIntr =
+ IID == Intrinsic::amdgcn_load_to_lds
+ ? Intrinsic::amdgcn_raw_ptr_buffer_load_lds
+ : Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds;
Instruction *NewLoad = IRB.CreateIntrinsic(
- Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {},
- {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux});
+ NewIntr, {}, {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux});
copyMetadata(NewLoad, &I);
SplitUsers.insert(&I);
I.replaceAllUsesWith(NewLoad);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
new file mode 100644
index 0000000..c26e973
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
@@ -0,0 +1,240 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower LDS global variables with the target extension type
+// "amdgpu.named.barrier", which require specialized address assignment. The
+// pass assigns a unique barrier identifier to each named-barrier LDS variable
+// and encodes that identifier in the global's !absolute_symbol metadata, so
+// that subsequent LDS lowering passes can process these barriers correctly
+// and without conflicts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUMemoryUtils.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+#include <algorithm>
+
+#define DEBUG_TYPE "amdgpu-lower-exec-sync"
+
+using namespace llvm;
+using namespace AMDGPU;
+
+namespace {
+
+// If GV is also used directly by other kernels, create a new GV
+// used only by this kernel and its function.
+static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
+ Function *KF) {
+ bool NeedsReplacement = false;
+ for (Use &U : GV->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ Function *F = I->getFunction();
+ if (isKernel(*F) && F != KF) {
+ NeedsReplacement = true;
+ break;
+ }
+ }
+ }
+ if (!NeedsReplacement)
+ return GV;
+ // Create a new GV used only by this kernel and its function
+ GlobalVariable *NewGV = new GlobalVariable(
+ M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
+ GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
+ GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
+ NewGV->copyAttributesFrom(GV);
+ for (Use &U : make_early_inc_range(GV->uses())) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ Function *F = I->getFunction();
+ if (!isKernel(*F) || F == KF) {
+ U.getUser()->replaceUsesOfWith(GV, NewGV);
+ }
+ }
+ }
+ return NewGV;
+}
+
+// Write the specified address into metadata where it can be retrieved by
+// the assembler. Format is a half open range, [Address Address+1)
+static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
+ uint32_t Address) {
+ LLVMContext &Ctx = M->getContext();
+ auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(Ctx, {MinC, MaxC}));
+}
+
+template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) {
+ sort(V, [](const auto *L, const auto *R) {
+ return L->getName() < R->getName();
+ });
+ return {std::move(V)};
+}
+
+// Main utility function for lowering special LDS variables.
+static bool lowerExecSyncGlobalVariables(
+ Module &M, LDSUsesInfoTy &LDSUsesInfo,
+ VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
+ bool Changed = false;
+ const DataLayout &DL = M.getDataLayout();
+ // The 1st round: give module-absolute assignments
+ int NumAbsolutes = 0;
+ SmallVector<GlobalVariable *> OrderedGVs;
+ // Entries may be erased below, so iterate with an early-increment range.
+ for (auto &K : make_early_inc_range(LDSToKernelsThatNeedToAccessItIndirectly)) {
+ GlobalVariable *GV = K.first;
+ if (!isNamedBarrier(*GV))
+ continue;
+ // Give a module-absolute assignment if it is indirectly accessed by
+ // multiple kernels. This is not precise, but we don't want to duplicate
+ // a function when it is called by multiple kernels.
+ if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
+ OrderedGVs.push_back(GV);
+ } else {
+ // Leave it to the 2nd round, which will give a kernel-relative
+ // assignment if it is only indirectly accessed by one kernel.
+ LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
+ }
+ LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
+ }
+ OrderedGVs = sortByName(std::move(OrderedGVs));
+ for (GlobalVariable *GV : OrderedGVs) {
+ unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned BarId = NumAbsolutes + 1;
+ unsigned BarCnt = GV->getGlobalSize(DL) / 16;
+ NumAbsolutes += BarCnt;
+
+ // Bits [3:0] are zero for alignment, bits [8:4] hold the barrier id, and
+ // bits [11:9] hold the barrier scope (see the worked example after this
+ // function).
+ unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+ recordLDSAbsoluteAddress(&M, GV, Offset);
+ }
+ OrderedGVs.clear();
+
+ // The 2nd round: give a kernel-relative assignment for GV that
+ // either only indirectly accessed by single kernel or only directly
+ // accessed by multiple kernels.
+ SmallVector<Function *> OrderedKernels;
+ for (auto &K : LDSUsesInfo.direct_access) {
+ Function *F = K.first;
+ assert(isKernel(*F));
+ OrderedKernels.push_back(F);
+ }
+ OrderedKernels = sortByName(std::move(OrderedKernels));
+
+ DenseMap<Function *, uint32_t> Kernel2BarId;
+ for (Function *F : OrderedKernels) {
+ // GV is erased from the set inside the loop; use an early-increment range.
+ for (GlobalVariable *GV : make_early_inc_range(LDSUsesInfo.direct_access[F])) {
+ if (!isNamedBarrier(*GV))
+ continue;
+
+ LDSUsesInfo.direct_access[F].erase(GV);
+ if (GV->isAbsoluteSymbolRef()) {
+ // already assigned
+ continue;
+ }
+ OrderedGVs.push_back(GV);
+ }
+ OrderedGVs = sortByName(std::move(OrderedGVs));
+ for (GlobalVariable *GV : OrderedGVs) {
+ // GV could also be used directly by other kernels. If so, we need to
+ // create a new GV used only by this kernel and its function.
+ auto NewGV = uniquifyGVPerKernel(M, GV, F);
+ Changed |= (NewGV != GV);
+ unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned BarId = Kernel2BarId[F];
+ BarId += NumAbsolutes + 1;
+ unsigned BarCnt = GV->getGlobalSize(DL) / 16;
+ Kernel2BarId[F] += BarCnt;
+ unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+ recordLDSAbsoluteAddress(&M, NewGV, Offset);
+ }
+ OrderedGVs.clear();
+ }
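+ // Illustrative numbering (hedged): if round 1 assigned NumAbsolutes = 2
+ // module-absolute barriers, the first kernel-relative barrier in each
+ // kernel gets BarId = 3, advancing by each GV's BarCnt thereafter.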
+ // Also erase those special LDS variables from indirect_access.
+ for (auto &K : LDSUsesInfo.indirect_access) {
+ assert(isKernel(*K.first));
+ for (GlobalVariable *GV : K.second) {
+ if (isNamedBarrier(*GV))
+ K.second.erase(GV);
+ }
+ }
+ return Changed;
+}
+
+static bool runLowerExecSyncGlobals(Module &M) {
+ CallGraph CG = CallGraph(M);
+ bool Changed = false;
+ Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+
+ // For each kernel, what variables does it access directly or through
+ // callees
+ LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
+
+ // For each variable accessed through callees, which kernels access it
+ VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
+ for (auto &K : LDSUsesInfo.indirect_access) {
+ Function *F = K.first;
+ assert(isKernel(*F));
+ for (GlobalVariable *GV : K.second) {
+ LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+ }
+ }
+
+ if (LDSUsesInfo.HasSpecialGVs) {
+ // Special LDS variables need special address assignment
+ Changed |= lowerExecSyncGlobalVariables(
+ M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
+ }
+ return Changed;
+}
+
+class AMDGPULowerExecSyncLegacy : public ModulePass {
+public:
+ static char ID;
+ AMDGPULowerExecSyncLegacy() : ModulePass(ID) {}
+ bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPULowerExecSyncLegacy::ID = 0;
+char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID;
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
+ "AMDGPU lowering of execution synchronization", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
+ "AMDGPU lowering of execution synchronization", false,
+ false)
+
+bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) {
+ return runLowerExecSyncGlobals(M);
+}
+
+ModulePass *llvm::createAMDGPULowerExecSyncLegacyPass() {
+ return new AMDGPULowerExecSyncLegacy();
+}
+
+PreservedAnalyses AMDGPULowerExecSyncPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ return runLowerExecSyncGlobals(M) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index dec781d..f93b0b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -12,14 +12,26 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUAsanInstrumentation.h"
#include "GCNSubtarget.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetMachine.h"
+#include <optional>
+#include <string>
#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
@@ -37,6 +49,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetPassConfig>();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.setPreservesAll();
}
};
@@ -58,13 +71,131 @@ static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
return InsPt;
}
-static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
+static void addAliasScopeMetadata(Function &F, const DataLayout &DL,
+ DominatorTree &DT) {
+ // Collect noalias arguments.
+ SmallVector<const Argument *, 4u> NoAliasArgs;
+
+ for (Argument &Arg : F.args())
+ if (Arg.hasNoAliasAttr() && !Arg.use_empty())
+ NoAliasArgs.push_back(&Arg);
+
+ if (NoAliasArgs.empty())
+ return;
+
+ // Add alias scopes for each noalias argument.
+ MDBuilder MDB(F.getContext());
+ DenseMap<const Argument *, MDNode *> NewScopes;
+ MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain(F.getName());
+
+ for (unsigned I = 0u; I < NoAliasArgs.size(); ++I) {
+ const Argument *Arg = NoAliasArgs[I];
+ MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Arg->getName());
+ NewScopes.insert({Arg, NewScope});
+ }
+
+ // Iterate over all instructions.
+ for (inst_iterator Inst = inst_begin(F), InstEnd = inst_end(F);
+ Inst != InstEnd; ++Inst) {
+ // If instruction accesses memory, collect its pointer arguments.
+ Instruction *I = &(*Inst);
+ SmallVector<const Value *, 2u> PtrArgs;
+
+ if (std::optional<MemoryLocation> MO = MemoryLocation::getOrNone(I))
+ PtrArgs.push_back(MO->Ptr);
+ else if (const CallBase *Call = dyn_cast<CallBase>(I)) {
+ if (Call->doesNotAccessMemory())
+ continue;
+
+ for (Value *Arg : Call->args()) {
+ if (!Arg->getType()->isPointerTy())
+ continue;
+
+ PtrArgs.push_back(Arg);
+ }
+ }
+
+ if (PtrArgs.empty())
+ continue;
+
+ // Collect underlying objects of pointer arguments.
+ SmallVector<Metadata *, 4u> Scopes;
+ SmallPtrSet<const Value *, 4u> ObjSet;
+ SmallVector<Metadata *, 4u> NoAliases;
+
+ for (const Value *Val : PtrArgs) {
+ SmallVector<const Value *, 4u> Objects;
+ getUnderlyingObjects(Val, Objects);
+ ObjSet.insert_range(Objects);
+ }
+
+ bool RequiresNoCaptureBefore = false;
+ bool UsesUnknownObject = false;
+ bool UsesAliasingPtr = false;
+
+ for (const Value *Val : ObjSet) {
+ if (isa<ConstantData>(Val))
+ continue;
+
+ if (const Argument *Arg = dyn_cast<Argument>(Val)) {
+ if (!Arg->hasAttribute(Attribute::NoAlias))
+ UsesAliasingPtr = true;
+ } else
+ UsesAliasingPtr = true;
+
+ if (isEscapeSource(Val))
+ RequiresNoCaptureBefore = true;
+ else if (!isa<Argument>(Val) && !isIdentifiedObject(Val))
+ UsesUnknownObject = true;
+ }
+
+ if (UsesUnknownObject)
+ continue;
+
+ // Collect noalias scopes for instruction.
+ for (const Argument *Arg : NoAliasArgs) {
+ if (ObjSet.contains(Arg))
+ continue;
+
+ if (!RequiresNoCaptureBefore ||
+ !capturesAnything(PointerMayBeCapturedBefore(
+ Arg, false, I, &DT, false, CaptureComponents::Provenance)))
+ NoAliases.push_back(NewScopes[Arg]);
+ }
+
+ // Add noalias metadata to instruction.
+ if (!NoAliases.empty()) {
+ MDNode *NewMD =
+ MDNode::concatenate(Inst->getMetadata(LLVMContext::MD_noalias),
+ MDNode::get(F.getContext(), NoAliases));
+ Inst->setMetadata(LLVMContext::MD_noalias, NewMD);
+ }
+
+ // Collect scopes for alias.scope metadata.
+ if (!UsesAliasingPtr)
+ for (const Argument *Arg : NoAliasArgs) {
+ if (ObjSet.count(Arg))
+ Scopes.push_back(NewScopes[Arg]);
+ }
+
+ // Add alias.scope metadata to instruction.
+ if (!Scopes.empty()) {
+ MDNode *NewMD =
+ MDNode::concatenate(Inst->getMetadata(LLVMContext::MD_alias_scope),
+ MDNode::get(F.getContext(), Scopes));
+ Inst->setMetadata(LLVMContext::MD_alias_scope, NewMD);
+ }
+ }
+}
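+
+// Illustrative effect (a hedged sketch, not literal pass output): for a
+// kernel such as
+//   define amdgpu_kernel void @k(ptr noalias %a, ptr %b)
+// an access through %a receives !alias.scope naming %a's new scope, while
+// accesses that provably do not derive from %a receive !noalias on that
+// same scope, mirroring the inliner's handling of noalias arguments.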
+
+static bool lowerKernelArguments(Function &F, const TargetMachine &TM,
+ DominatorTree &DT) {
CallingConv::ID CC = F.getCallingConv();
if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
return false;
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- LLVMContext &Ctx = F.getParent()->getContext();
+ LLVMContext &Ctx = F.getContext();
const DataLayout &DL = F.getDataLayout();
BasicBlock &EntryBlock = *F.begin();
IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock));
@@ -86,6 +217,9 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
uint64_t ExplicitArgOffset = 0;
+
+ addAliasScopeMetadata(F, F.getParent()->getDataLayout(), DT);
+
for (Argument &Arg : F.args()) {
const bool IsByRef = Arg.hasByRefAttr();
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
@@ -124,11 +258,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
!ST.hasUsableDSOffset())
continue;
-
- // FIXME: We can replace this with equivalent alias.scope/noalias
- // metadata, but this appears to be a lot of work.
- if (Arg.hasNoAliasAttr())
- continue;
}
auto *VT = dyn_cast<FixedVectorType>(ArgTy);
@@ -215,8 +344,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
}
}
- // TODO: Convert noalias arg to !noalias
-
if (DoShiftOpt) {
Value *ExtractBits = OffsetDiff == 0 ?
Load : Builder.CreateLShr(Load, OffsetDiff * 8);
@@ -245,7 +372,8 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
auto &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
- return lowerKernelArguments(F, TM);
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return lowerKernelArguments(F, TM, DT);
}
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
@@ -261,7 +389,8 @@ FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
PreservedAnalyses
AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) {
- bool Changed = lowerKernelArguments(F, TM);
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ bool Changed = lowerKernelArguments(F, TM, DT);
if (Changed) {
// TODO: Preserves a lot more.
PreservedAnalyses PA;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index fed7a13..fbfb710 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===//
+//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -19,6 +19,7 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -66,13 +67,11 @@ public:
bool runOnModule(Module &M) override;
- StringRef getPassName() const override {
- return "AMDGPU Kernel Attributes";
- }
+ StringRef getPassName() const override { return "AMDGPU Kernel Attributes"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
- }
+ }
};
Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
@@ -98,26 +97,28 @@ static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
}
static bool processUse(CallInst *CI, bool IsV5OrAbove) {
- Function *F = CI->getParent()->getParent();
+ Function *F = CI->getFunction();
auto *MD = F->getMetadata("reqd_work_group_size");
const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
const bool HasUniformWorkGroupSize =
- F->getFnAttribute("uniform-work-group-size").getValueAsBool();
+ F->getFnAttribute("uniform-work-group-size").getValueAsBool();
SmallVector<unsigned> MaxNumWorkgroups =
AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
/*Size=*/3, /*DefaultVal=*/0);
if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
+ !Intrinsic::getDeclarationIfExists(CI->getModule(),
+ Intrinsic::amdgcn_dispatch_ptr) &&
none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
return false;
Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
- Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
- Value *Remainders[3] = {nullptr, nullptr, nullptr};
- Value *GridSizes[3] = {nullptr, nullptr, nullptr};
+ Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
+ Value *Remainders[3] = {nullptr, nullptr, nullptr};
+ Value *GridSizes[3] = {nullptr, nullptr, nullptr};
const DataLayout &DL = F->getDataLayout();
@@ -230,13 +231,15 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
bool MadeChange = false;
if (IsV5OrAbove && HasUniformWorkGroupSize) {
- // Under v5 __ockl_get_local_size returns the value computed by the expression:
+ // Under v5 __ockl_get_local_size returns the value computed by the
+ // expression:
//
- // workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
+ // workgroup_id < hidden_block_count ? hidden_group_size :
+ // hidden_remainder
//
- // For functions with the attribute uniform-work-group-size=true. we can evaluate
- // workgroup_id < hidden_block_count as true, and thus hidden_group_size is returned
- // for __ockl_get_local_size.
+ // For functions with the attribute uniform-work-group-size=true, we can
+ // evaluate workgroup_id < hidden_block_count as true, and thus
+ // hidden_group_size is returned for __ockl_get_local_size.
for (int I = 0; I < 3; ++I) {
Value *BlockCount = BlockCounts[I];
if (!BlockCount)
@@ -261,7 +264,8 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
for (Value *Remainder : Remainders) {
if (!Remainder)
continue;
- Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
+ Remainder->replaceAllUsesWith(
+ Constant::getNullValue(Remainder->getType()));
MadeChange = true;
}
} else if (HasUniformWorkGroupSize) { // Pre-V5.
@@ -302,13 +306,13 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
continue;
for (User *UMin : ZextGroupSize->users()) {
- if (match(UMin,
- m_UMin(m_Sub(m_Specific(GridSize),
- m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
- m_Specific(ZextGroupSize)))) {
+ if (match(UMin, m_UMin(m_Sub(m_Specific(GridSize),
+ m_Mul(GroupIDIntrin,
+ m_Specific(ZextGroupSize))),
+ m_Specific(ZextGroupSize)))) {
if (HasReqdWorkGroupSize) {
- ConstantInt *KnownSize
- = mdconst::extract<ConstantInt>(MD->getOperand(I));
+ ConstantInt *KnownSize =
+ mdconst::extract<ConstantInt>(MD->getOperand(I));
UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
KnownSize, UMin->getType(), false, DL));
} else {
@@ -322,6 +326,49 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
}
}
+ // Upgrade the old method of calculating the block count from the grid size.
+ // We pattern-match any case where the implicit-argument group size is the
+ // divisor of a dispatch-packet grid-size read of the same dimension.
+ if (IsV5OrAbove) {
+ for (int I = 0; I < 3; I++) {
+ Value *GroupSize = GroupSizes[I];
+ if (!GroupSize || !GroupSize->getType()->isIntegerTy(16))
+ continue;
+
+ for (User *U : GroupSize->users()) {
+ Instruction *Inst = cast<Instruction>(U);
+ if (isa<ZExtInst>(Inst) && !Inst->use_empty())
+ Inst = cast<Instruction>(*Inst->user_begin());
+
+ using namespace llvm::PatternMatch;
+ if (!match(
+ Inst,
+ m_UDiv(m_ZExtOrSelf(m_Load(m_GEP(
+ m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(),
+ m_SpecificInt(GRID_SIZE_X + I * sizeof(uint32_t))))),
+ m_Value())))
+ continue;
+
+ IRBuilder<> Builder(Inst);
+
+ Value *GEP = Builder.CreateInBoundsGEP(
+ Builder.getInt8Ty(), CI,
+ {ConstantInt::get(Type::getInt64Ty(CI->getContext()),
+ HIDDEN_BLOCK_COUNT_X + I * sizeof(uint32_t))});
+ Instruction *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP);
+ BlockCount->setMetadata(LLVMContext::MD_invariant_load,
+ MDNode::get(CI->getContext(), {}));
+ BlockCount->setMetadata(LLVMContext::MD_noundef,
+ MDNode::get(CI->getContext(), {}));
+
+ Value *BlockCountExt = Builder.CreateZExt(BlockCount, Inst->getType());
+ Inst->replaceAllUsesWith(BlockCountExt);
+ Inst->eraseFromParent();
+ MadeChange = true;
+ }
+ }
+ }
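+ // Illustrative rewrite (a hedged sketch): a computation such as
+ //   %bc = udiv i32 %grid_size_x, %wg_size_x
+ // where %grid_size_x was loaded from the dispatch packet becomes a direct,
+ // invariant load of hidden_block_count.x off the implicit argument pointer,
+ // removing the division.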
+
// If reqd_work_group_size is set, we can replace work group size with it.
if (!HasReqdWorkGroupSize)
return MadeChange;
@@ -340,7 +387,6 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
return MadeChange;
}
-
// TODO: Move makeLIDRangeMetadata usage into here. Seem to not get
// TargetPassConfig for subtarget.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
@@ -364,7 +410,6 @@ bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
return MadeChange;
}
-
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
"AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
@@ -385,12 +430,14 @@ AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
return PreservedAnalyses::all();
+ bool Changed = false;
for (Instruction &I : instructions(F)) {
if (CallInst *CI = dyn_cast<CallInst>(&I)) {
if (CI->getCalledFunction() == BasePtr)
- processUse(CI, IsV5OrAbove);
+ Changed |= processUse(CI, IsV5OrAbove);
}
}
- return PreservedAnalyses::all();
+ return !Changed ? PreservedAnalyses::all()
+ : PreservedAnalyses::none().preserveSet<CFGAnalyses>();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index a4ef524..588eee0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -441,7 +441,7 @@ public:
return KernelSet;
for (Function &Func : M.functions()) {
- if (Func.isDeclaration() || !isKernelLDS(&Func))
+ if (Func.isDeclaration() || !isKernel(Func))
continue;
for (GlobalVariable *GV : LDSUsesInfo.indirect_access[&Func]) {
if (VariableSet.contains(GV)) {
@@ -501,9 +501,7 @@ public:
// strategy
continue;
}
- CandidateTy Candidate(
- GV, K.second.size(),
- DL.getTypeAllocSize(GV->getValueType()).getFixedValue());
+ CandidateTy Candidate(GV, K.second.size(), GV->getGlobalSize(DL));
if (MostUsed < Candidate)
MostUsed = Candidate;
}
@@ -555,7 +553,7 @@ public:
for (Function &Func : M->functions()) {
if (Func.isDeclaration())
continue;
- if (!isKernelLDS(&Func))
+ if (!isKernel(Func))
continue;
if (KernelsThatAllocateTableLDS.contains(&Func) ||
@@ -703,7 +701,7 @@ public:
return false;
}
Function *F = I->getFunction();
- return !isKernelLDS(F);
+ return !isKernel(*F);
});
// Replace uses of module scope variable from kernel functions that
@@ -711,7 +709,7 @@ public:
// Record on each kernel whether the module scope global is used by it
for (Function &Func : M.functions()) {
- if (Func.isDeclaration() || !isKernelLDS(&Func))
+ if (Func.isDeclaration() || !isKernel(Func))
continue;
if (KernelsThatAllocateModuleLDS.contains(&Func)) {
@@ -743,7 +741,7 @@ public:
DenseMap<Function *, LDSVariableReplacement> KernelToReplacement;
for (Function &Func : M.functions()) {
- if (Func.isDeclaration() || !isKernelLDS(&Func))
+ if (Func.isDeclaration() || !isKernel(Func))
continue;
DenseSet<GlobalVariable *> KernelUsedVariables;
@@ -828,7 +826,7 @@ public:
// semantics. Setting the alignment here allows this IR pass to accurately
// predict the exact constant at which it will be allocated.
- assert(isKernelLDS(func));
+ assert(isKernel(*func));
LLVMContext &Ctx = M.getContext();
const DataLayout &DL = M.getDataLayout();
@@ -878,7 +876,7 @@ public:
for (auto &func : OrderedKernels) {
if (KernelsThatIndirectlyAllocateDynamicLDS.contains(func)) {
- assert(isKernelLDS(func));
+ assert(isKernel(*func));
if (!func->hasName()) {
reportFatalUsageError("anonymous kernels cannot use LDS variables");
}
@@ -912,7 +910,7 @@ public:
auto *I = dyn_cast<Instruction>(U.getUser());
if (!I)
continue;
- if (isKernelLDS(I->getFunction()))
+ if (isKernel(*I->getFunction()))
continue;
replaceUseWithTableLookup(M, Builder, table, GV, U, nullptr);
@@ -922,126 +920,6 @@ public:
return KernelToCreatedDynamicLDS;
}
- static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
- Function *KF) {
- bool NeedsReplacement = false;
- for (Use &U : GV->uses()) {
- if (auto *I = dyn_cast<Instruction>(U.getUser())) {
- Function *F = I->getFunction();
- if (isKernelLDS(F) && F != KF) {
- NeedsReplacement = true;
- break;
- }
- }
- }
- if (!NeedsReplacement)
- return GV;
- // Create a new GV used only by this kernel and its function
- GlobalVariable *NewGV = new GlobalVariable(
- M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
- GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
- GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
- NewGV->copyAttributesFrom(GV);
- for (Use &U : make_early_inc_range(GV->uses())) {
- if (auto *I = dyn_cast<Instruction>(U.getUser())) {
- Function *F = I->getFunction();
- if (!isKernelLDS(F) || F == KF) {
- U.getUser()->replaceUsesOfWith(GV, NewGV);
- }
- }
- }
- return NewGV;
- }
-
- bool lowerSpecialLDSVariables(
- Module &M, LDSUsesInfoTy &LDSUsesInfo,
- VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
- bool Changed = false;
- const DataLayout &DL = M.getDataLayout();
- // The 1st round: give module-absolute assignments
- int NumAbsolutes = 0;
- std::vector<GlobalVariable *> OrderedGVs;
- for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
- GlobalVariable *GV = K.first;
- if (!isNamedBarrier(*GV))
- continue;
- // give a module-absolute assignment if it is indirectly accessed by
- // multiple kernels. This is not precise, but we don't want to duplicate
- // a function when it is called by multiple kernels.
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
- OrderedGVs.push_back(GV);
- } else {
- // leave it to the 2nd round, which will give a kernel-relative
- // assignment if it is only indirectly accessed by one kernel
- LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
- }
- LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
- }
- OrderedGVs = sortByName(std::move(OrderedGVs));
- for (GlobalVariable *GV : OrderedGVs) {
- unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
- unsigned BarId = NumAbsolutes + 1;
- unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
- NumAbsolutes += BarCnt;
-
- // 4 bits for alignment, 5 bits for the barrier num,
- // 3 bits for the barrier scope
- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, GV, Offset);
- }
- OrderedGVs.clear();
-
- // The 2nd round: give a kernel-relative assignment for GV that
- // either only indirectly accessed by single kernel or only directly
- // accessed by multiple kernels.
- std::vector<Function *> OrderedKernels;
- for (auto &K : LDSUsesInfo.direct_access) {
- Function *F = K.first;
- assert(isKernelLDS(F));
- OrderedKernels.push_back(F);
- }
- OrderedKernels = sortByName(std::move(OrderedKernels));
-
- llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
- for (Function *F : OrderedKernels) {
- for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
- if (!isNamedBarrier(*GV))
- continue;
-
- LDSUsesInfo.direct_access[F].erase(GV);
- if (GV->isAbsoluteSymbolRef()) {
- // already assigned
- continue;
- }
- OrderedGVs.push_back(GV);
- }
- OrderedGVs = sortByName(std::move(OrderedGVs));
- for (GlobalVariable *GV : OrderedGVs) {
- // GV could also be used directly by other kernels. If so, we need to
- // create a new GV used only by this kernel and its function.
- auto NewGV = uniquifyGVPerKernel(M, GV, F);
- Changed |= (NewGV != GV);
- unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
- unsigned BarId = Kernel2BarId[F];
- BarId += NumAbsolutes + 1;
- unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
- Kernel2BarId[F] += BarCnt;
- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, NewGV, Offset);
- }
- OrderedGVs.clear();
- }
- // Also erase those special LDS variables from indirect_access.
- for (auto &K : LDSUsesInfo.indirect_access) {
- assert(isKernelLDS(K.first));
- for (GlobalVariable *GV : K.second) {
- if (isNamedBarrier(*GV))
- K.second.erase(GV);
- }
- }
- return Changed;
- }
-
bool runOnModule(Module &M) {
CallGraph CG = CallGraph(M);
bool Changed = superAlignLDSGlobals(M);
@@ -1058,18 +936,12 @@ public:
VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
for (auto &K : LDSUsesInfo.indirect_access) {
Function *F = K.first;
- assert(isKernelLDS(F));
+ assert(isKernel(*F));
for (GlobalVariable *GV : K.second) {
LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
}
}
- if (LDSUsesInfo.HasSpecialGVs) {
- // Special LDS variables need special address assignment
- Changed |= lowerSpecialLDSVariables(
- M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
- }
-
// Partition variables accessed indirectly into the different strategies
DenseSet<GlobalVariable *> ModuleScopeVariables;
DenseSet<GlobalVariable *> TableLookupVariables;
@@ -1157,7 +1029,7 @@ public:
const DataLayout &DL = M.getDataLayout();
for (Function &Func : M.functions()) {
- if (Func.isDeclaration() || !isKernelLDS(&Func))
+ if (Func.isDeclaration() || !isKernel(Func))
continue;
// All three of these are optional. The first variable is allocated at
@@ -1187,14 +1059,14 @@ public:
if (AllocateModuleScopeStruct) {
// Allocated at zero, recorded once on construction, not once per
// kernel
- Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType());
+ Offset += MaybeModuleScopeStruct->getGlobalSize(DL);
}
if (AllocateKernelScopeStruct) {
GlobalVariable *KernelStruct = Replacement->second.SGV;
Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct));
recordLDSAbsoluteAddress(&M, KernelStruct, Offset);
- Offset += DL.getTypeAllocSize(KernelStruct->getValueType());
+ Offset += KernelStruct->getGlobalSize(DL);
}
// If there is dynamic allocation, the alignment needed is included in
@@ -1264,7 +1136,7 @@ private:
}
Align Alignment = AMDGPU::getAlign(DL, &GV);
- TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType());
+ uint64_t GVSize = GV.getGlobalSize(DL);
if (GVSize > 8) {
// We might want to use a b96 or b128 load/store
@@ -1310,8 +1182,7 @@ private:
LDSVarsToTransform.begin(), LDSVarsToTransform.end()));
for (GlobalVariable *GV : Sorted) {
- OptimizedStructLayoutField F(GV,
- DL.getTypeAllocSize(GV->getValueType()),
+ OptimizedStructLayoutField F(GV, GV->getGlobalSize(DL),
AMDGPU::getAlign(DL, GV));
LayoutFields.emplace_back(F);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
index 1e6589e..f4872ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
@@ -43,9 +43,10 @@
#include "AMDGPULowerVGPREncoding.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
#include "SIInstrInfo.h"
-#include "llvm/ADT/PackedVector.h"
+#include "llvm/ADT/bit.h"
+#include "llvm/Support/MathExtras.h"
using namespace llvm;
@@ -57,21 +58,44 @@ class AMDGPULowerVGPREncoding {
static constexpr unsigned OpNum = 4;
static constexpr unsigned BitsPerField = 2;
static constexpr unsigned NumFields = 4;
- static constexpr unsigned FieldMask = (1 << BitsPerField) - 1;
- using ModeType = PackedVector<unsigned, BitsPerField,
- std::bitset<BitsPerField * NumFields>>;
+ static constexpr unsigned ModeWidth = NumFields * BitsPerField;
+ static constexpr unsigned ModeMask = (1 << ModeWidth) - 1;
+ static constexpr unsigned VGPRMSBShift =
+ llvm::countr_zero_constexpr<unsigned>(AMDGPU::Hwreg::DST_VGPR_MSB);
- class ModeTy : public ModeType {
- public:
- // bitset constructor will set all bits to zero
- ModeTy() : ModeType(0) {}
+ struct OpMode {
+ // No MSBs set means they are not required to be of a particular value.
+ std::optional<unsigned> MSBits;
+
+ bool update(const OpMode &New, bool &Rewritten) {
+ bool Updated = false;
+ if (New.MSBits) {
+ if (*New.MSBits != MSBits.value_or(0)) {
+ Updated = true;
+ Rewritten |= MSBits.has_value();
+ }
+ MSBits = New.MSBits;
+ }
+ return Updated;
+ }
+ };
+
+ struct ModeTy {
+ OpMode Ops[OpNum];
- operator int64_t() const { return raw_bits().to_ulong(); }
+ bool update(const ModeTy &New, bool &Rewritten) {
+ bool Updated = false;
+ for (unsigned I : seq(OpNum))
+ Updated |= Ops[I].update(New.Ops[I], Rewritten);
+ return Updated;
+ }
- static ModeTy fullMask() {
- ModeTy M;
- M.raw_bits().flip();
- return M;
+ unsigned encode() const {
+ // Layout: [src0 msb, src1 msb, src2 msb, dst msb].
+ unsigned V = 0;
+ for (const auto &[I, Op] : enumerate(Ops))
+ V |= Op.MSBits.value_or(0) << (I * 2);
+ return V;
}
};
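+ // Worked example (illustrative): Ops MSBits = {1, 0, 2, 3} encodes as
+ // 1 | (0 << 2) | (2 << 4) | (3 << 6) = 0xE1, i.e. src0=1, src1=0,
+ // src2=2, dst=3 in the low byte of the s_set_vgpr_msb immediate.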
@@ -82,19 +106,15 @@ private:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
+ // Current basic block.
+ MachineBasicBlock *MBB;
+
/// Most recent s_set_* instruction.
MachineInstr *MostRecentModeSet;
- /// Whether the current mode is known.
- bool CurrentModeKnown;
-
/// Current mode bits.
ModeTy CurrentMode;
- /// Current mask of mode bits that instructions since MostRecentModeSet care
- /// about.
- ModeTy CurrentMask;
-
/// Number of current hard clause instructions.
unsigned ClauseLen;
@@ -108,10 +128,15 @@ private:
MachineInstr *Clause;
/// Insert mode change before \p I. \returns true if mode was changed.
- bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I);
+ bool setMode(ModeTy NewMode, MachineBasicBlock::instr_iterator I);
/// Reset mode to default.
- void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); }
+ void resetMode(MachineBasicBlock::instr_iterator I) {
+ ModeTy Mode;
+ for (OpMode &Op : Mode.Ops)
+ Op.MSBits = 0;
+ setMode(Mode, I);
+ }
/// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
std::optional<unsigned> getMSBs(const MachineOperand &MO) const;
@@ -119,49 +144,69 @@ private:
/// Handle single \p MI. \return true if changed.
bool runOnMachineInstr(MachineInstr &MI);
- /// Compute the mode and mode mask for a single \p MI given \p Ops operands
+ /// Compute the mode for a single \p MI given \p Ops operands
/// bit mapping. Optionally takes second array \p Ops2 for VOPD.
/// If provided and an operand from \p Ops is not a VGPR, then \p Ops2
/// is checked.
- void computeMode(ModeTy &NewMode, ModeTy &Mask, MachineInstr &MI,
+ void computeMode(ModeTy &NewMode, MachineInstr &MI,
const AMDGPU::OpName Ops[OpNum],
const AMDGPU::OpName *Ops2 = nullptr);
/// Check if an instruction \p I is within a clause and returns a suitable
/// iterator to insert mode change. It may also modify the S_CLAUSE
/// instruction to extend it or drop the clause if it cannot be adjusted.
- MachineInstr *handleClause(MachineInstr *I);
+ MachineBasicBlock::instr_iterator
+ handleClause(MachineBasicBlock::instr_iterator I);
+
+ /// Check if an instruction \p I is immediately after another program state
+ /// instruction which it cannot coissue with. If so, insert before that
+ /// instruction to encourage more coissuing.
+ MachineBasicBlock::instr_iterator
+ handleCoissue(MachineBasicBlock::instr_iterator I);
+
+ /// Handle S_SETREG_IMM32_B32 targeting MODE register. On certain hardware,
+ /// this instruction clobbers VGPR MSB bits[12:19], so we need to restore
+ /// the current mode. \returns true if the instruction was modified or a
+ /// new one was inserted.
+ bool handleSetregMode(MachineInstr &MI);
+
+ /// Update bits[12:19] of the imm operand in S_SETREG_IMM32_B32 to contain
+ /// the VGPR MSB mode value. \returns true if the immediate was changed.
+ bool updateSetregModeImm(MachineInstr &MI, int64_t ModeValue);
};
-bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask,
- MachineInstr *I) {
- assert((NewMode.raw_bits() & ~Mask.raw_bits()).none());
+bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode,
+ MachineBasicBlock::instr_iterator I) {
+ // Record previous mode into high 8 bits of the immediate.
+ int64_t OldModeBits = CurrentMode.encode() << ModeWidth;
- if (CurrentModeKnown) {
- auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits();
+ bool Rewritten = false;
+ if (!CurrentMode.update(NewMode, Rewritten))
+ return false;
- if ((Delta & Mask.raw_bits()).none()) {
- CurrentMask |= Mask;
- return false;
+ if (MostRecentModeSet && !Rewritten) {
+ // Update MostRecentModeSet with the new mode. It can be either
+ // S_SET_VGPR_MSB or S_SETREG_IMM32_B32 (with Size <= 12).
+ if (MostRecentModeSet->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
+ MachineOperand &Op = MostRecentModeSet->getOperand(0);
+ // Carry old mode bits from the existing instruction.
+ int64_t OldModeBits = Op.getImm() & (ModeMask << ModeWidth);
+ Op.setImm(CurrentMode.encode() | OldModeBits);
+ } else {
+ assert(MostRecentModeSet->getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
+ "unexpected MostRecentModeSet opcode");
+ updateSetregModeImm(*MostRecentModeSet, CurrentMode.encode());
}
- if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) {
- CurrentMode |= NewMode;
- CurrentMask |= Mask;
-
- MostRecentModeSet->getOperand(0).setImm(CurrentMode);
- return true;
- }
+ return true;
}
I = handleClause(I);
- MostRecentModeSet =
- BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
- .addImm(NewMode);
+ I = handleCoissue(I);
+ MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
+ .addImm(NewMode.encode() | OldModeBits);
CurrentMode = NewMode;
- CurrentMask = Mask;
- CurrentModeKnown = true;
return true;
}
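+// Illustrative immediate (hedged): switching the mode from 0x01 to 0x41
+// yields s_set_vgpr_msb 0x0141: the low byte carries the new mode and the
+// high byte the previous one, per the convention recorded above.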
@@ -179,12 +224,10 @@ AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const {
return Idx >> 8;
}
-void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, ModeTy &Mask,
- MachineInstr &MI,
+void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, MachineInstr &MI,
const AMDGPU::OpName Ops[OpNum],
const AMDGPU::OpName *Ops2) {
NewMode = {};
- Mask = {};
for (unsigned I = 0; I < OpNum; ++I) {
MachineOperand *Op = TII->getNamedOperand(MI, Ops[I]);
@@ -223,31 +266,31 @@ void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, ModeTy &Mask,
TII->hasVALU32BitEncoding(MI.getOpcode()))))
continue;
- NewMode[I] = MSBits.value();
- Mask[I] = FieldMask;
+ NewMode.Ops[I].MSBits = MSBits.value();
}
}
bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
auto Ops = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc());
if (Ops.first) {
- ModeTy NewMode, Mask;
- computeMode(NewMode, Mask, MI, Ops.first, Ops.second);
- return setMode(NewMode, Mask, &MI);
+ ModeTy NewMode;
+ computeMode(NewMode, MI, Ops.first, Ops.second);
+ return setMode(NewMode, MI.getIterator());
}
assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());
return false;
}
-MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) {
+MachineBasicBlock::instr_iterator
+AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) {
if (!ClauseRemaining)
return I;
// A clause cannot start with a special instruction, place it right before
// the clause.
if (ClauseRemaining == ClauseLen) {
- I = Clause->getPrevNode();
+ I = Clause->getPrevNode()->getIterator();
assert(I->isBundle());
return I;
}
@@ -272,6 +315,106 @@ MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) {
return I;
}
+MachineBasicBlock::instr_iterator
+AMDGPULowerVGPREncoding::handleCoissue(MachineBasicBlock::instr_iterator I) {
+ if (I.isEnd())
+ return I;
+
+ // "Program State instructions" are instructions which are used to control
+ // operation of the GPU rather than performing arithmetic. Such instructions
+ // have different coissuing rules w.r.t s_set_vgpr_msb.
+ auto isProgramStateInstr = [this](MachineInstr *MI) {
+ unsigned Opc = MI->getOpcode();
+ return TII->isBarrier(Opc) || TII->isWaitcnt(Opc) ||
+ Opc == AMDGPU::S_DELAY_ALU;
+ };
+
+ while (!I.isEnd() && I != I->getParent()->begin()) {
+ auto Prev = std::prev(I);
+ if (!isProgramStateInstr(&*Prev))
+ return I;
+ I = Prev;
+ }
+
+ return I;
+}
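+
+// Illustrative placement (hedged): if the insertion point is immediately
+// preceded by "s_waitcnt 0", the s_set_vgpr_msb is hoisted above it so the
+// two scalar instructions have a chance to coissue.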
+
+/// Convert mode value from S_SET_VGPR_MSB format to MODE register format.
+/// S_SET_VGPR_MSB uses: (src0[0-1], src1[2-3], src2[4-5], dst[6-7])
+/// MODE register uses: (dst[0-1], src0[2-3], src1[4-5], src2[6-7])
+/// This is a left rotation by 2 bits on an 8-bit value.
+static int64_t convertModeToSetregFormat(int64_t Mode) {
+ assert(isUInt<8>(Mode) && "Mode expected to be 8-bit");
+ return llvm::rotl<uint8_t>(static_cast<uint8_t>(Mode), /*R=*/2);
+}
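+
+// Worked example (illustrative): Mode = 0xC0 (dst MSBs = 3, all sources 0)
+// rotates left by 2 to 0x03, placing the dst field in MODE bits [0:1].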
+
+bool AMDGPULowerVGPREncoding::updateSetregModeImm(MachineInstr &MI,
+ int64_t ModeValue) {
+ assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32);
+
+ // Convert from S_SET_VGPR_MSB format to MODE register format
+ int64_t SetregMode = convertModeToSetregFormat(ModeValue);
+
+ MachineOperand *ImmOp = TII->getNamedOperand(MI, AMDGPU::OpName::imm);
+ int64_t OldImm = ImmOp->getImm();
+ int64_t NewImm =
+ (OldImm & ~AMDGPU::Hwreg::VGPR_MSB_MASK) | (SetregMode << VGPRMSBShift);
+ ImmOp->setImm(NewImm);
+ return NewImm != OldImm;
+}
+
+bool AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) {
+ using namespace AMDGPU::Hwreg;
+
+ assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
+ "only S_SETREG_IMM32_B32 needs to be handled");
+
+ MachineOperand *SIMM16Op = TII->getNamedOperand(MI, AMDGPU::OpName::simm16);
+ assert(SIMM16Op && "SIMM16Op must be present");
+
+ auto [HwRegId, Offset, Size] = HwregEncoding::decode(SIMM16Op->getImm());
+ (void)Offset;
+ if (HwRegId != ID_MODE)
+ return false;
+
+ int64_t ModeValue = CurrentMode.encode();
+
+ // Case 1: Size <= 12 - the original instruction uses imm32[0:Size-1], so
+ // imm32[12:19] is unused. Safe to set imm32[12:19] to the correct VGPR
+ // MSBs.
+ if (Size <= VGPRMSBShift) {
+ // This instruction now acts as MostRecentModeSet so it can be updated if
+ // CurrentMode changes via piggybacking.
+ MostRecentModeSet = &MI;
+ return updateSetregModeImm(MI, ModeValue);
+ }
+
+ // Case 2: Size > 12 - the original instruction uses bits beyond 11, so we
+ // cannot arbitrarily modify imm32[12:19]. Check if it already matches VGPR
+ // MSBs. Note: imm32[12:19] is in MODE register format, while ModeValue is
+ // in S_SET_VGPR_MSB format, so we need to convert before comparing.
+ MachineOperand *ImmOp = TII->getNamedOperand(MI, AMDGPU::OpName::imm);
+ assert(ImmOp && "ImmOp must be present");
+ int64_t ImmBits12To19 = (ImmOp->getImm() & VGPR_MSB_MASK) >> VGPRMSBShift;
+ int64_t SetregModeValue = convertModeToSetregFormat(ModeValue);
+ if (ImmBits12To19 == SetregModeValue) {
+ // Already correct, but we must invalidate MostRecentModeSet because this
+ // instruction will overwrite mode[12:19]. We can't update this instruction
+ // via piggybacking (bits[12:19] are meaningful), so if CurrentMode changes,
+ // a new s_set_vgpr_msb will be inserted after this instruction.
+ MostRecentModeSet = nullptr;
+ return false;
+ }
+
+ // imm32[12:19] doesn't match VGPR MSBs - insert s_set_vgpr_msb after
+ // the original instruction to restore the correct value.
+ MachineBasicBlock::iterator InsertPt = std::next(MI.getIterator());
+ MostRecentModeSet = BuildMI(*MBB, InsertPt, MI.getDebugLoc(),
+ TII->get(AMDGPU::S_SET_VGPR_MSB))
+ .addImm(ModeValue);
+ return true;
+}
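+
+// Illustrative cases (hedged): s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4)
+// has Size = 4 <= 12, so the VGPR MSBs are piggybacked onto imm32[12:19];
+// a full-width hwreg(HW_REG_MODE) write has Size = 32 and already defines
+// those bits, so a trailing s_set_vgpr_msb is emitted on a mismatch.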
+
bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (!ST.has1024AddressableVGPRs())
@@ -282,11 +425,10 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
bool Changed = false;
ClauseLen = ClauseRemaining = 0;
- CurrentMode.reset();
- CurrentMask.reset();
- CurrentModeKnown = true;
+ CurrentMode = {};
for (auto &MBB : MF) {
MostRecentModeSet = nullptr;
+ this->MBB = &MBB;
for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) {
if (MI.isMetaInstruction())
@@ -294,17 +436,16 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
if (MI.isTerminator() || MI.isCall()) {
if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
- MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
- CurrentMode.reset();
- CurrentModeKnown = true;
- } else
- resetMode(&MI);
+ MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED)
+ CurrentMode = {};
+ else
+ resetMode(MI.getIterator());
continue;
}
if (MI.isInlineAsm()) {
if (TII->hasVGPRUses(MI))
- resetMode(&MI);
+ resetMode(MI.getIterator());
continue;
}
@@ -317,20 +458,20 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
continue;
}
+ if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
+ ST.hasSetregVGPRMSBFixup()) {
+ Changed |= handleSetregMode(MI);
+ continue;
+ }
+
Changed |= runOnMachineInstr(MI);
if (ClauseRemaining)
--ClauseRemaining;
}
- // If we're falling through to a block that has at least one other
- // predecessor, we no longer know the mode.
- MachineBasicBlock *Next = MBB.getNextNode();
- if (Next && Next->pred_size() >= 2 &&
- llvm::is_contained(Next->predecessors(), &MBB)) {
- if (CurrentMode.raw_bits().any())
- CurrentModeKnown = false;
- }
+ // Reset the mode if we are falling through.
+ resetMode(MBB.instr_end());
}
return Changed;
@@ -367,7 +508,5 @@ AMDGPULowerVGPREncodingPass::run(MachineFunction &MF,
if (!AMDGPULowerVGPREncoding().run(MF))
return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
+ return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 680e7eb..fc408aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -229,7 +229,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.addOperand(Src);
return;
} else if (Opcode == AMDGPU::SI_TCRETURN ||
- Opcode == AMDGPU::SI_TCRETURN_GFX) {
+ Opcode == AMDGPU::SI_TCRETURN_GFX ||
+ Opcode == AMDGPU::SI_TCRETURN_CHAIN) {
// TODO: How to use branch immediate and avoid register+add?
Opcode = AMDGPU::S_SETPC_B64;
} else if (AMDGPU::getT16D16Helper(Opcode)) {
@@ -243,7 +244,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
int MCOpcode = TII->pseudoToMCOpcode(Opcode);
if (MCOpcode == -1) {
- LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext();
+ LLVMContext &C = MI->getMF()->getFunction().getContext();
C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have "
"a target-specific version: " + Twine(MI->getOpcode()));
}
@@ -332,7 +333,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
StringRef Err;
if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
- LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext();
+ LLVMContext &C = MI->getMF()->getFunction().getContext();
C.emitError("Illegal instruction detected: " + Err);
MI->print(errs());
}
@@ -346,7 +347,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
}
} else {
// We don't want these pseudo instructions encoded. They are
- // placeholder terminator instructions and should only be printed as
+ // placeholder instructions and should only be printed as
// comments.
if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
if (isVerbose())
@@ -360,6 +361,20 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
return;
}
+ if (MI->getOpcode() == AMDGPU::ASYNCMARK) {
+ if (isVerbose())
+ OutStreamer->emitRawComment(" asyncmark");
+ return;
+ }
+
+ if (MI->getOpcode() == AMDGPU::WAIT_ASYNCMARK) {
+ if (isVerbose()) {
+ OutStreamer->emitRawComment(" wait_asyncmark(" +
+ Twine(MI->getOperand(0).getImm()) + ")");
+ }
+ return;
+ }
+
if (MI->getOpcode() == AMDGPU::SCHED_BARRIER) {
if (isVerbose()) {
std::string HexString;
@@ -405,6 +420,16 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
return;
}
+ unsigned Opc = MI->getOpcode();
+ if (LLVM_UNLIKELY(Opc == TargetOpcode::STATEPOINT ||
+ Opc == TargetOpcode::STACKMAP ||
+ Opc == TargetOpcode::PATCHPOINT)) {
+ LLVMContext &Ctx = MI->getMF()->getFunction().getContext();
+ Ctx.emitError("unhandled statepoint-like instruction");
+ OutStreamer->emitRawComment("unsupported statepoint/stackmap/patchpoint");
+ return;
+ }
+
if (isVerbose())
if (STI.getInstrInfo()->isBlockLoadStore(MI->getOpcode()))
emitVGPRBlockComment(MI, STI.getInstrInfo(), STI.getRegisterInfo(),
@@ -412,7 +437,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
*OutStreamer);
if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
- unsigned V = MI->getOperand(0).getImm();
+ unsigned V = MI->getOperand(0).getImm() & 0xff;
OutStreamer->AddComment(
" msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) +
" src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
index 75e3d8c..a541a26 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
@@ -13,13 +13,61 @@
#include "AMDGPUMIRFormatter.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
+static constexpr char SWaitAluImmPrefix = '.';
+static constexpr StringLiteral SWaitAluDelim = "_";
+
+static constexpr StringLiteral VaVdstName = "VaVdst";
+static constexpr StringLiteral VaSdstName = "VaSdst";
+static constexpr StringLiteral VaSsrcName = "VaSsrc";
+static constexpr StringLiteral HoldCntName = "HoldCnt";
+static constexpr StringLiteral VmVsrcName = "VmVsrc";
+static constexpr StringLiteral VaVccName = "VaVcc";
+static constexpr StringLiteral SaSdstName = "SaSdst";
+
+static constexpr StringLiteral AllOff = "AllOff";
+
+void AMDGPUMIRFormatter::printSWaitAluImm(uint64_t Imm, raw_ostream &OS) const {
+ bool NonePrinted = true;
+ ListSeparator Delim(SWaitAluDelim);
+ auto PrintFieldIfNotMax = [&](StringRef Descr, uint64_t Num, unsigned Max) {
+ if (Num != Max) {
+ OS << Delim << Descr << SWaitAluDelim << Num;
+ NonePrinted = false;
+ }
+ };
+ OS << SWaitAluImmPrefix;
+ PrintFieldIfNotMax(VaVdstName, AMDGPU::DepCtr::decodeFieldVaVdst(Imm),
+ AMDGPU::DepCtr::getVaVdstBitMask());
+ PrintFieldIfNotMax(VaSdstName, AMDGPU::DepCtr::decodeFieldVaSdst(Imm),
+ AMDGPU::DepCtr::getVaSdstBitMask());
+ PrintFieldIfNotMax(VaSsrcName, AMDGPU::DepCtr::decodeFieldVaSsrc(Imm),
+ AMDGPU::DepCtr::getVaSsrcBitMask());
+ PrintFieldIfNotMax(
+ HoldCntName,
+ AMDGPU::DepCtr::decodeFieldHoldCnt(Imm,
+ AMDGPU::getIsaVersion(STI.getCPU())),
+ AMDGPU::DepCtr::getHoldCntBitMask(AMDGPU::getIsaVersion(STI.getCPU())));
+ PrintFieldIfNotMax(VmVsrcName, AMDGPU::DepCtr::decodeFieldVmVsrc(Imm),
+ AMDGPU::DepCtr::getVmVsrcBitMask());
+ PrintFieldIfNotMax(VaVccName, AMDGPU::DepCtr::decodeFieldVaVcc(Imm),
+ AMDGPU::DepCtr::getVaVccBitMask());
+ PrintFieldIfNotMax(SaSdstName, AMDGPU::DepCtr::decodeFieldSaSdst(Imm),
+ AMDGPU::DepCtr::getSaSdstBitMask());
+ if (NonePrinted)
+ OS << AllOff;
+}
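+
+// Example output (illustrative): an immediate with VaVdst = 0 and SaSdst = 0
+// and every other field at its maximum prints as ".VaVdst_0_SaSdst_0"; an
+// immediate with all fields at their maxima prints as ".AllOff".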
+
void AMDGPUMIRFormatter::printImm(raw_ostream &OS, const MachineInstr &MI,
std::optional<unsigned int> OpIdx, int64_t Imm) const {
switch (MI.getOpcode()) {
+ case AMDGPU::S_WAITCNT_DEPCTR:
+ printSWaitAluImm(Imm, OS);
+ break;
case AMDGPU::S_DELAY_ALU:
assert(OpIdx == 0);
printSDelayAluImm(Imm, OS);
@@ -39,6 +87,8 @@ bool AMDGPUMIRFormatter::parseImmMnemonic(const unsigned OpCode,
{
switch (OpCode) {
+ case AMDGPU::S_WAITCNT_DEPCTR:
+ return parseSWaitAluImmMnemonic(OpIdx, Imm, Src, ErrorCallback);
case AMDGPU::S_DELAY_ALU:
return parseSDelayAluImmMnemonic(OpIdx, Imm, Src, ErrorCallback);
default:
@@ -90,6 +140,89 @@ void AMDGPUMIRFormatter::printSDelayAluImm(int64_t Imm,
Outdep(Id1);
}
+bool AMDGPUMIRFormatter::parseSWaitAluImmMnemonic(
+ const unsigned int OpIdx, int64_t &Imm, StringRef &Src,
+ MIRFormatter::ErrorCallbackType &ErrorCallback) const {
+ // TODO: For now accept integer masks for compatibility with old MIR.
+ if (!Src.consumeInteger(10, Imm))
+ return false;
+
+ // Initialize with all checks off.
+ Imm = AMDGPU::DepCtr::getDefaultDepCtrEncoding(STI);
+ // The input is in the form: .Name1_Num1_Name2_Num2
+ // Drop the '.' prefix.
+ bool ConsumePrefix = Src.consume_front(SWaitAluImmPrefix);
+ if (!ConsumePrefix)
+ return ErrorCallback(Src.begin(), "expected prefix");
+ if (Src.empty())
+ return ErrorCallback(Src.begin(), "expected <CounterName>_<CounterNum>");
+
+ // Special case for all off.
+ if (Src == AllOff)
+ return false;
+
+ // Parse a counter name, number pair in each iteration.
+ while (!Src.empty()) {
+ // Src: Name1_Num1_Name2_Num2
+ // ^
+ size_t DelimIdx = Src.find(SWaitAluDelim);
+ if (DelimIdx == StringRef::npos)
+ return ErrorCallback(Src.begin(), "expected <CounterName>_<CounterNum>");
+ // Src: Name1_Num1_Name2_Num2
+ // ^^^^^
+ StringRef Name = Src.substr(0, DelimIdx);
+ // Save the position of the name for accurate error reporting.
+ StringRef::iterator NamePos = Src.begin();
+ [[maybe_unused]] bool ConsumeName = Src.consume_front(Name);
+ assert(ConsumeName && "Expected name");
+ [[maybe_unused]] bool ConsumeDelim = Src.consume_front(SWaitAluDelim);
+ assert(ConsumeDelim && "Expected delimiter");
+ // Src: Num1_Name2_Num2
+ // ^
+ DelimIdx = Src.find(SWaitAluDelim);
+ // Src: Num1_Name2_Num2
+ // ^^^^
+ int64_t Num;
+ // Save the position of the number for accurate error reporting.
+ StringRef::iterator NumPos = Src.begin();
+ if (Src.consumeInteger(10, Num) || Num < 0)
+ return ErrorCallback(NumPos,
+ "expected non-negative integer counter number");
+ unsigned Max;
+ if (Name == VaVdstName) {
+ Max = AMDGPU::DepCtr::getVaVdstBitMask();
+ Imm = AMDGPU::DepCtr::encodeFieldVaVdst(Imm, Num);
+ } else if (Name == VmVsrcName) {
+ Max = AMDGPU::DepCtr::getVmVsrcBitMask();
+ Imm = AMDGPU::DepCtr::encodeFieldVmVsrc(Imm, Num);
+ } else if (Name == VaSdstName) {
+ Max = AMDGPU::DepCtr::getVaSdstBitMask();
+ Imm = AMDGPU::DepCtr::encodeFieldVaSdst(Imm, Num);
+ } else if (Name == VaSsrcName) {
+ Max = AMDGPU::DepCtr::getVaSsrcBitMask();
+ Imm = AMDGPU::DepCtr::encodeFieldVaSsrc(Imm, Num);
+ } else if (Name == HoldCntName) {
+ const AMDGPU::IsaVersion &Version = AMDGPU::getIsaVersion(STI.getCPU());
+ Max = AMDGPU::DepCtr::getHoldCntBitMask(Version);
+ Imm = AMDGPU::DepCtr::encodeFieldHoldCnt(Imm, Num, Version);
+ } else if (Name == VaVccName) {
+ Max = AMDGPU::DepCtr::getVaVccBitMask();
+ Imm = AMDGPU::DepCtr::encodeFieldVaVcc(Imm, Num);
+ } else if (Name == SaSdstName) {
+ Max = AMDGPU::DepCtr::getSaSdstBitMask();
+ Imm = AMDGPU::DepCtr::encodeFieldSaSdst(Imm, Num);
+ } else {
+ return ErrorCallback(NamePos, "invalid counter name");
+ }
+ // Don't allow the values to reach their maximum value.
+ if (Num >= Max)
+ return ErrorCallback(NumPos, "counter value too large");
+ // Src: Name2_Num2
+ Src.consume_front(SWaitAluDelim);
+ }
+ return false;
+}
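+
+// Example round-trip (illustrative): parsing ".VmVsrc_0_SaSdst_0" starts
+// from the default (all checks off) encoding and tightens only VmVsrc and
+// SaSdst to 0, matching what printSWaitAluImm would print back.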
+
bool AMDGPUMIRFormatter::parseSDelayAluImmMnemonic(
const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src,
llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
index c5c9473..dbfc645 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
@@ -16,6 +16,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPUMIRFORMATTER_H
#define LLVM_LIB_TARGET_AMDGPUMIRFORMATTER_H
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/MIRFormatter.h"
namespace llvm {
@@ -25,21 +26,20 @@ struct PerFunctionMIParsingState;
class AMDGPUMIRFormatter final : public MIRFormatter {
public:
- AMDGPUMIRFormatter() = default;
- virtual ~AMDGPUMIRFormatter() = default;
+ explicit AMDGPUMIRFormatter(const MCSubtargetInfo &STI) : STI(STI) {}
+ ~AMDGPUMIRFormatter() override = default;
/// Implement target specific printing for machine operand immediate value, so
/// that we can have more meaningful mnemonic than a 64-bit integer. Passing
/// None to OpIdx means the index is unknown.
- virtual void printImm(raw_ostream &OS, const MachineInstr &MI,
- std::optional<unsigned> OpIdx,
- int64_t Imm) const override;
+ void printImm(raw_ostream &OS, const MachineInstr &MI,
+ std::optional<unsigned> OpIdx, int64_t Imm) const override;
/// Implement target specific parsing of immediate mnemonics. The mnemonic is
/// a string with a leading dot.
- virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
- StringRef Src, int64_t &Imm,
- ErrorCallbackType ErrorCallback) const override;
+ bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
+ StringRef Src, int64_t &Imm,
+ ErrorCallbackType ErrorCallback) const override;
/// Implement target specific parsing of target custom pseudo source value.
bool
@@ -49,9 +49,17 @@ public:
ErrorCallbackType ErrorCallback) const override;
private:
+ const MCSubtargetInfo &STI;
+ /// Prints the string to represent s_wait_alu immediate value.
+ void printSWaitAluImm(uint64_t Imm, raw_ostream &OS) const;
/// Print the string to represent s_delay_alu immediate value
void printSDelayAluImm(int64_t Imm, llvm::raw_ostream &OS) const;
+ /// Parse the immediate pseudo literal for s_wait_alu
+ bool parseSWaitAluImmMnemonic(
+ const unsigned int OpIdx, int64_t &Imm, StringRef &Src,
+ MIRFormatter::ErrorCallbackType &ErrorCallback) const;
+
/// Parse the immediate pseudo literal for s_delay_alu
bool parseSDelayAluImmMnemonic(
const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 664a15c..1730757 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -80,11 +80,6 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
- // FIXME: Shouldn't be target specific
- Attribute NSZAttr = F.getFnAttribute("no-signed-zeros-fp-math");
- NoSignedZerosFPMath =
- NSZAttr.isStringAttribute() && NSZAttr.getValueAsString() == "true";
-
const GlobalVariable *DynLdsGlobal = getKernelDynLDSGlobalFromFunction(F);
if (DynLdsGlobal || hasLDSKernelArgument(F))
UsesDynamicLDS = true;
@@ -107,7 +102,7 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
if (!BarAddr)
llvm_unreachable("named barrier should have an assigned address");
Entry.first->second = BarAddr.value();
- unsigned BarCnt = DL.getTypeAllocSize(GV.getValueType()) / 16;
+ unsigned BarCnt = GV.getGlobalSize(DL) / 16;
recordNumNamedBarriers(BarAddr.value(), BarCnt);
return BarAddr.value();
}
@@ -135,8 +130,7 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
// section, and not within some other non-absolute-address object
// allocated here, but the extra error detection is minimal and we would
// have to pass the Function around or cache the attribute value.
- uint32_t ObjectEnd =
- ObjectStart + DL.getTypeAllocSize(GV.getValueType());
+ uint32_t ObjectEnd = ObjectStart + GV.getGlobalSize(DL);
if (ObjectEnd > StaticLDSSize) {
report_fatal_error(
"Absolute address LDS variable outside of static frame");
@@ -152,7 +146,7 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
/// during lowering.
Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);
- StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
+ StaticLDSSize += GV.getGlobalSize(DL);
// Align LDS size to trailing, e.g. for aligning dynamic shared memory
LDSSize = alignTo(StaticLDSSize, Trailing);
@@ -161,7 +155,7 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
"expected region address space");
Offset = StaticGDSSize = alignTo(StaticGDSSize, Alignment);
- StaticGDSSize += DL.getTypeAllocSize(GV.getValueType());
+ StaticGDSSize += GV.getGlobalSize(DL);
// FIXME: Apply alignment of dynamic GDS
GDSSize = StaticGDSSize;
@@ -210,7 +204,7 @@ void AMDGPUMachineFunction::setDynLDSAlign(const Function &F,
const GlobalVariable &GV) {
const Module *M = F.getParent();
const DataLayout &DL = M->getDataLayout();
- assert(DL.getTypeAllocSize(GV.getValueType()).isZero());
+ assert(GV.getGlobalSize(DL) == 0);
Align Alignment =
DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index fc64e16..1317210 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -61,8 +61,6 @@ protected:
// Functions with the amdgpu_cs_chain or amdgpu_cs_chain_preserve CC.
bool IsChainFunction = false;
- bool NoSignedZerosFPMath = false;
-
// Function may be memory bound.
bool MemoryBound = false;
@@ -107,10 +105,6 @@ public:
return isEntryFunction() || isChainFunction();
}
- bool hasNoSignedZerosFPMath() const {
- return NoSignedZerosFPMath;
- }
-
bool isMemoryBound() const {
return MemoryBound;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index e17c211..9fbb19d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -70,7 +70,7 @@ bool isDynamicLDS(const GlobalVariable &GV) {
const DataLayout &DL = M->getDataLayout();
if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
return false;
- return DL.getTypeAllocSize(GV.getValueType()) == 0;
+ return GV.getGlobalSize(DL) == 0;
}
bool isLDSVariableToLower(const GlobalVariable &GV) {
@@ -126,7 +126,7 @@ void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
for (User *V : GV.users()) {
if (auto *I = dyn_cast<Instruction>(V)) {
Function *F = I->getFunction();
- if (isKernelLDS(F))
+ if (isKernel(*F))
kernels[F].insert(&GV);
else
Functions[F].insert(&GV);
@@ -135,10 +135,6 @@ void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
}
}
-bool isKernelLDS(const Function *F) {
- return AMDGPU::isKernel(F->getCallingConv());
-}
-
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
FunctionVariableMap DirectMapKernel;
@@ -148,7 +144,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
// Collect functions whose address has escaped
DenseSet<Function *> AddressTakenFuncs;
for (Function &F : M.functions()) {
- if (!isKernelLDS(&F))
+ if (!isKernel(F))
if (F.hasAddressTaken(nullptr,
/* IgnoreCallbackUses */ false,
/* IgnoreAssumeLikeCalls */ false,
@@ -180,7 +176,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
// access all variables accessed by functions whose address escaped
for (Function &F : M.functions()) {
if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) {
- if (!isKernelLDS(&F)) {
+ if (!isKernel(F)) {
set_union(TransitiveMapFunction[&F],
VariablesReachableThroughFunctionPointer);
}
@@ -190,7 +186,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
// Direct implementation of collecting all variables reachable from each
// function
for (Function &Func : M.functions()) {
- if (Func.isDeclaration() || isKernelLDS(&Func))
+ if (Func.isDeclaration() || isKernel(Func))
continue;
DenseSet<Function *> seen; // catches cycles
@@ -227,7 +223,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
FunctionVariableMap IndirectMapKernel;
for (Function &Func : M.functions()) {
- if (Func.isDeclaration() || !isKernelLDS(&Func))
+ if (Func.isDeclaration() || !isKernel(Func))
continue;
for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
@@ -273,6 +269,8 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
// this is a re-run of the pass
// so we don't have anything to do.
// - No variables are absolute.
+  // Named barriers that are absolute symbols are removed
+ // from the maps.
std::optional<bool> HasAbsoluteGVs;
bool HasSpecialGVs = false;
for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
@@ -284,6 +282,10 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
if (IsDirectMapDynLDSGV)
continue;
if (isNamedBarrier(*GV)) {
+ if (IsAbsolute) {
+ DirectMapKernel[Fn].erase(GV);
+ IndirectMapKernel[Fn].erase(GV);
+ }
HasSpecialGVs = true;
continue;
}
@@ -335,7 +337,7 @@ void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
Function *PotentialCallee =
ExternalCallRecord.second->getFunction();
assert(PotentialCallee);
- if (!isKernelLDS(PotentialCallee)) {
+ if (!isKernel(*PotentialCallee)) {
for (StringRef Attr : FnAttrs)
PotentialCallee->removeFnAttr(Attr);
}
@@ -369,6 +371,7 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
case Intrinsic::amdgcn_s_barrier_wait:
case Intrinsic::amdgcn_s_barrier_leave:
case Intrinsic::amdgcn_s_get_barrier_state:
+ case Intrinsic::amdgcn_s_wakeup_barrier:
case Intrinsic::amdgcn_wave_barrier:
case Intrinsic::amdgcn_sched_barrier:
case Intrinsic::amdgcn_sched_group_barrier:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
index 058e744..8868b93 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
@@ -53,8 +53,6 @@ void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
FunctionVariableMap &kernels,
FunctionVariableMap &functions);
-bool isKernelLDS(const Function *F);
-
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M);
/// Strip FnAttr attribute from any functions where we may have
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index a6074ea..f464fbf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -13,6 +13,12 @@
// NOTE: NO INCLUDE GUARD DESIRED!
+#ifndef MODULE_ANALYSIS
+#define MODULE_ANALYSIS(NAME, CREATE_PASS)
+#endif
+MODULE_ANALYSIS("amdgpu-argument-usage", AMDGPUArgumentUsageAnalysis())
+#undef MODULE_ANALYSIS
+
#ifndef MODULE_PASS
#define MODULE_PASS(NAME, CREATE_PASS)
#endif
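The table above follows LLVM's usual .def X-macro convention: a consumer defines MODULE_ANALYSIS before including the file and gets one expansion per entry. A minimal sketch of a new-PM consumer, assuming a plain ModuleAnalysisManager (the function and registration site are illustrative; the target wires this up elsewhere):

  // Hypothetical consumer of the X-macro table.
  static void registerAMDGPUModuleAnalyses(llvm::ModuleAnalysisManager &MAM) {
  #define MODULE_ANALYSIS(NAME, CREATE_PASS)                                   \
    MAM.registerPass([&] { return CREATE_PASS; });
  #include "AMDGPUPassRegistry.def" // expands the entry, then #undefs the macro
  }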
@@ -29,8 +35,8 @@ MODULE_PASS("amdgpu-perf-hint",
MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this))
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
+MODULE_PASS("amdgpu-lower-exec-sync", AMDGPULowerExecSyncPass())
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
-MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS
@@ -69,6 +75,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
AMDGPUUnifyDivergentExitNodesPass())
FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
#undef FUNCTION_PASS
#ifndef FUNCTION_ANALYSIS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
index cf2ab825..a3be0f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -48,7 +48,7 @@ private:
FuncInfoMap FIM;
public:
- AMDGPUPerfHintAnalysis() {}
+ AMDGPUPerfHintAnalysis() = default;
// OldPM
bool runOnSCC(const GCNTargetMachine &TM, CallGraphSCC &SCC);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index e86b473..0264d88 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -367,10 +367,10 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
return TLI->isCanonicalized(Reg, MF);
}
-// The buffer_load_{i8, i16} intrinsics are intially lowered as buffer_load_{u8,
-// u16} instructions. Here, the buffer_load_{u8, u16} instructions are combined
-// with sign extension instrucions in order to generate buffer_load_{i8, i16}
-// instructions.
+// The buffer_load_{i8, i16} intrinsics are initially lowered as
+// buffer_load_{u8, u16} instructions. Here, the buffer_load_{u8, u16}
+// instructions are combined with sign extension instructions in order to
+// generate buffer_load_{i8, i16} instructions.
// Identify buffer_load_{u8, u16}.
bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 6e54737..4a70c5d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -177,8 +177,7 @@ void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16(
MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const {
Register Src = MatchInfo.Origin;
- assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
- LLT::scalar(64));
+ assert(MI.getMF()->getRegInfo().getType(Src) == LLT::scalar(64));
const LLT S32 = LLT::scalar(32);
auto Unmerge = B.buildUnmerge(S32, Src);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
index ffbbf63..7d6e3ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
@@ -127,7 +127,7 @@ private:
// will also be preloaded even if that data is unused.
Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
FunctionType *FT = F.getFunctionType();
- LLVMContext &Ctx = F.getParent()->getContext();
+ LLVMContext &Ctx = F.getContext();
SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
for (unsigned I = 0; I <= LastPreloadIndex; ++I)
FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
@@ -196,7 +196,7 @@ public:
SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
for (auto *U : ImplicitArgPtr->users()) {
Instruction *CI = dyn_cast<Instruction>(U);
- if (!CI || CI->getParent()->getParent() != &F)
+ if (!CI || CI->getFunction() != &F)
continue;
for (auto *U : CI->users()) {
@@ -213,7 +213,7 @@ public:
continue;
// FIXME: Expand handle merged loads.
- LLVMContext &Ctx = F.getParent()->getContext();
+ LLVMContext &Ctx = F.getContext();
Type *LoadTy = Load->getType();
HiddenArg HA = getHiddenArgFromOffset(Offset);
if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp
index 0137b3f..a43600a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp
@@ -46,10 +46,7 @@ class AMDGPUPrepareAGPRAllocLegacy : public MachineFunctionPass {
public:
static char ID;
- AMDGPUPrepareAGPRAllocLegacy() : MachineFunctionPass(ID) {
- initializeAMDGPUPrepareAGPRAllocLegacyPass(
- *PassRegistry::getPassRegistry());
- }
+ AMDGPUPrepareAGPRAllocLegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -62,10 +59,8 @@ public:
};
} // End anonymous namespace.
-INITIALIZE_PASS_BEGIN(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
- "AMDGPU Prepare AGPR Alloc", false, false)
-INITIALIZE_PASS_END(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
- "AMDGPU Prepare AGPR Alloc", false, false)
+INITIALIZE_PASS(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
+ "AMDGPU Prepare AGPR Alloc", false, false)
char AMDGPUPrepareAGPRAllocLegacy::ID = 0;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index f5e14c7..d3fa423 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -129,7 +129,7 @@ static StringRef getAsConstantStr(Value *V) {
static void diagnoseInvalidFormatString(const CallBase *CI) {
CI->getContext().diagnose(DiagnosticInfoUnsupported(
- *CI->getParent()->getParent(),
+ *CI->getFunction(),
"printf format string must be a trivially resolved constant string "
"global variable",
CI->getDebugLoc()));
@@ -416,9 +416,13 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
}
}
- // erase the printf calls
- for (auto *CI : Printfs)
+ // Erase the printf calls and replace all uses with 0, signaling success.
+  // Since OpenCL only specifies undefined behavior and no success criteria,
+  // always returning 0 to signal success is valid.
+ for (auto *CI : Printfs) {
+ CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 0));
CI->eraseFromParent();
+ }
Printfs.clear();
return true;
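The effect on a hypothetical caller that tests printf's return value:

  // Before:  %ret = call i32 (ptr, ...) @printf(ptr %fmt, i32 %v)
  //          %ok  = icmp sge i32 %ret, 0
  // After the replaceAllUsesWith/erase above:
  //          %ok  = icmp sge i32 0, 0    ; folds to true (success)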
@@ -434,6 +438,17 @@ bool AMDGPUPrintfRuntimeBindingImpl::run(Module &M) {
M.getModuleFlag("openmp"))
return false;
+ // Verify the signature of the printf function and skip if it isn't correct.
+ const FunctionType *PrintfFunctionTy = PrintfFunction->getFunctionType();
+ if (PrintfFunctionTy->getNumParams() != 1 || !PrintfFunctionTy->isVarArg() ||
+ !PrintfFunctionTy->getReturnType()->isIntegerTy(32))
+ return false;
+ Type *PrintfFormatArgTy = PrintfFunctionTy->getParamType(0);
+ if (!PrintfFormatArgTy->isPointerTy() ||
+ !AMDGPU::isFlatGlobalAddrSpace(
+ PrintfFormatArgTy->getPointerAddressSpace()))
+ return false;
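For reference, a declaration that passes this check could look like the following (address space illustrative; any flat or global address space is accepted):

  // declare i32 @printf(ptr addrspace(1), ...)
  //   i32 return, exactly one fixed pointer parameter, varargs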
+
for (auto &U : PrintfFunction->uses()) {
if (auto *CI = dyn_cast<CallInst>(U.getUser())) {
if (CI->isCallee(&U) && !CI->isNoBuiltin())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index ddabd25..ed676c3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -42,6 +42,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -85,6 +86,42 @@ static cl::opt<unsigned>
"when sorting profitable allocas"),
cl::init(4));
+// We support vector indices of the form (VarIndex * VarMul) + ConstIndex;
+// all parts are optional.
+struct GEPToVectorIndex {
+ Value *VarIndex = nullptr; // defaults to 0
+ ConstantInt *VarMul = nullptr; // defaults to 1
+ ConstantInt *ConstIndex = nullptr; // defaults to 0
+ Value *Full = nullptr;
+};
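As a worked example (offsets hypothetical): a GEP whose byte offset into the alloca is (i * 8) + 4, over 4-byte vector elements, decomposes into the element index (i * 2) + 1:

  // GEPToVectorIndex for byte offset (i * 8) + 4 with i32 elements:
  //   VarIndex   = i        // variable part
  //   VarMul     = 2        // 8-byte stride / 4-byte element
  //   ConstIndex = 1        // 4-byte offset / 4-byte element
  //   Full       = nullptr  // materialized lazily on first use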
+
+struct MemTransferInfo {
+ ConstantInt *SrcIndex = nullptr;
+ ConstantInt *DestIndex = nullptr;
+};
+
+// Analysis for planning the different strategies of alloca promotion.
+struct AllocaAnalysis {
+ AllocaInst *Alloca = nullptr;
+ DenseSet<Value *> Pointers;
+ SmallVector<Use *> Uses;
+ unsigned Score = 0;
+ bool HaveSelectOrPHI = false;
+ struct {
+ FixedVectorType *Ty = nullptr;
+ SmallVector<Instruction *> Worklist;
+ SmallVector<Instruction *> UsersToRemove;
+ MapVector<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx;
+ MapVector<MemTransferInst *, MemTransferInfo> TransferInfo;
+ } Vector;
+ struct {
+ bool Enable = false;
+ SmallVector<User *> Worklist;
+ } LDS;
+
+ explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
+};
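A compressed sketch of the structure's lifecycle, mirroring the rewritten run() loop later in this patch:

  AllocaAnalysis AA{AI};             // AI: a static, non-array alloca
  if (collectAllocaUses(AA)) {       // walk users; reject escaping pointers
    analyzePromoteToVector(AA);      // may set AA.Vector.Ty
    if (PromoteToLDS)
      analyzePromoteToLDS(AA);       // may set AA.LDS.Enable
    if (AA.Vector.Ty || AA.LDS.Enable)
      scoreAlloca(AA);               // fill AA.Score for sorting
  }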
+
// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
@@ -106,10 +143,7 @@ private:
std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
- /// BaseAlloca is the alloca root the search started from.
- /// Val may be that alloca or a recursive user of it.
- bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
- std::vector<Value *> &WorkList) const;
+ bool collectAllocaUses(AllocaAnalysis &AA) const;
/// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
/// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
@@ -122,10 +156,16 @@ private:
/// Check whether we have enough local memory for promotion.
bool hasSufficientLocalMem(const Function &F);
- bool tryPromoteAllocaToVector(AllocaInst &I);
- bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
+ FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
+ void analyzePromoteToVector(AllocaAnalysis &AA) const;
+ void promoteAllocaToVector(AllocaAnalysis &AA);
+ void analyzePromoteToLDS(AllocaAnalysis &AA) const;
+ bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
+ SetVector<IntrinsicInst *> &DeferredIntrs);
+ void
+ finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs);
- void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
+ void scoreAlloca(AllocaAnalysis &AA) const;
void setFunctionLimits(const Function &F);
@@ -236,53 +276,77 @@ FunctionPass *llvm::createAMDGPUPromoteAlloca() {
return new AMDGPUPromoteAlloca();
}
-static void collectAllocaUses(AllocaInst &Alloca,
- SmallVectorImpl<Use *> &Uses) {
- SmallVector<Instruction *, 4> WorkList({&Alloca});
+bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
+ const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
+ LLVM_DEBUG(dbgs() << " Cannot promote alloca: " << Msg << "\n"
+ << " " << *Inst << "\n");
+ return false;
+ };
+
+ SmallVector<Instruction *, 4> WorkList({AA.Alloca});
while (!WorkList.empty()) {
auto *Cur = WorkList.pop_back_val();
+ if (find(AA.Pointers, Cur) != AA.Pointers.end())
+ continue;
+ AA.Pointers.insert(Cur);
for (auto &U : Cur->uses()) {
- Uses.push_back(&U);
+ auto *Inst = cast<Instruction>(U.getUser());
+ if (isa<StoreInst>(Inst)) {
+ if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
+ return RejectUser(Inst, "pointer escapes via store");
+ }
+ }
+ AA.Uses.push_back(&U);
+
+ if (isa<GetElementPtrInst>(U.getUser())) {
+ WorkList.push_back(Inst);
+ } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
+ // Only promote a select if we know that the other select operand is
+ // from another pointer that will also be promoted.
+ if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2))
+ return RejectUser(Inst, "select from mixed objects");
+ WorkList.push_back(Inst);
+ AA.HaveSelectOrPHI = true;
+ } else if (auto *Phi = dyn_cast<PHINode>(Inst)) {
+ // Repeat for phis.
+
+ // TODO: Handle more complex cases. We should be able to replace loops
+ // over arrays.
+ switch (Phi->getNumIncomingValues()) {
+ case 1:
+ break;
+ case 2:
+ if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1))
+ return RejectUser(Inst, "phi from mixed objects");
+ break;
+ default:
+ return RejectUser(Inst, "phi with too many operands");
+ }
- if (isa<GetElementPtrInst>(U.getUser()))
- WorkList.push_back(cast<Instruction>(U.getUser()));
+ WorkList.push_back(Inst);
+ AA.HaveSelectOrPHI = true;
+ }
}
}
+ return true;
}
-void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
- SmallVectorImpl<AllocaInst *> &Allocas) {
- DenseMap<AllocaInst *, unsigned> Scores;
-
- for (auto *Alloca : Allocas) {
- LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
- unsigned &Score = Scores[Alloca];
- // Increment score by one for each user + a bonus for users within loops.
- SmallVector<Use *, 8> Uses;
- collectAllocaUses(*Alloca, Uses);
- for (auto *U : Uses) {
- Instruction *Inst = cast<Instruction>(U->getUser());
- if (isa<GetElementPtrInst>(Inst))
- continue;
- unsigned UserScore =
- 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
- LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
- Score += UserScore;
- }
- LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
+void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
+ LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
+ unsigned Score = 0;
+ // Increment score by one for each user + a bonus for users within loops.
+ for (auto *U : AA.Uses) {
+ Instruction *Inst = cast<Instruction>(U->getUser());
+ if (isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
+ isa<PHINode>(Inst))
+ continue;
+ unsigned UserScore =
+ 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
+ LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
+ Score += UserScore;
}
-
- stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
- return Scores.at(A) > Scores.at(B);
- });
-
- // clang-format off
- LLVM_DEBUG(
- dbgs() << "Sorted Worklist:\n";
- for (auto *A: Allocas)
- dbgs() << " " << *A << "\n";
- );
- // clang-format on
+ LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
+ AA.Score = Score;
}
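Worked numbers for the scoring above, assuming the cl::init(4) earlier in this file is LoopUserWeight's default:

  // user at loop depth 2  -> 1 + 4 * 2 = 9
  // user outside any loop -> 1 + 4 * 0 = 1
  // GEP / select / phi    -> skipped (pure address computation)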
void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
@@ -307,7 +371,7 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
DL = &Mod->getDataLayout();
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
- if (!ST.isPromoteAllocaEnabled())
+ if (!ST.enablePromoteAlloca())
return false;
bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
@@ -319,27 +383,49 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
: (MaxVGPRs * 32)) /
VGPRBudgetRatio;
- SmallVector<AllocaInst *, 16> Allocas;
+ std::vector<AllocaAnalysis> Allocas;
for (Instruction &I : F.getEntryBlock()) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
// Array allocations are probably not worth handling, since an allocation
// of the array type is the canonical form.
if (!AI->isStaticAlloca() || AI->isArrayAllocation())
continue;
- Allocas.push_back(AI);
+
+ LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
+
+ AllocaAnalysis AA{AI};
+ if (collectAllocaUses(AA)) {
+ analyzePromoteToVector(AA);
+ if (PromoteToLDS)
+ analyzePromoteToLDS(AA);
+ if (AA.Vector.Ty || AA.LDS.Enable) {
+ scoreAlloca(AA);
+ Allocas.push_back(std::move(AA));
+ }
+ }
}
}
- sortAllocasToPromote(Allocas);
+ stable_sort(Allocas,
+ [](const auto &A, const auto &B) { return A.Score > B.Score; });
+
+ // clang-format off
+ LLVM_DEBUG(
+ dbgs() << "Sorted Worklist:\n";
+ for (const auto &AA : Allocas)
+ dbgs() << " " << *AA.Alloca << "\n";
+ );
+ // clang-format on
bool Changed = false;
- for (AllocaInst *AI : Allocas) {
- const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
- // First, check if we have enough budget to vectorize this alloca.
- if (AllocaCost <= VectorizationBudget) {
- // If we do, attempt vectorization, otherwise, fall through and try
- // promoting to LDS instead.
- if (tryPromoteAllocaToVector(*AI)) {
+ SetVector<IntrinsicInst *> DeferredIntrs;
+ for (AllocaAnalysis &AA : Allocas) {
+ if (AA.Vector.Ty) {
+ const unsigned AllocaCost =
+ DL->getTypeSizeInBits(AA.Alloca->getAllocatedType());
+ // First, check if we have enough budget to vectorize this alloca.
+ if (AllocaCost <= VectorizationBudget) {
+ promoteAllocaToVector(AA);
Changed = true;
assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
"Underflow!");
@@ -347,16 +433,18 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
<< VectorizationBudget << "\n");
continue;
+ } else {
+ LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
+ << AllocaCost << ", budget:" << VectorizationBudget
+ << "): " << *AA.Alloca << "\n");
}
- } else {
- LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
- << AllocaCost << ", budget:" << VectorizationBudget
- << "): " << *AI << "\n");
}
- if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
+ if (AA.LDS.Enable &&
+ tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
Changed = true;
}
+ finishDeferredAllocaToLDSPromotion(DeferredIntrs);
// NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
// dangling pointers. If we want to reuse it past this point, the loop above
@@ -365,11 +453,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
return Changed;
}
-struct MemTransferInfo {
- ConstantInt *SrcIndex = nullptr;
- ConstantInt *DestIndex = nullptr;
-};
-
// Checks if the instruction I is a memset user of the alloca AI that we can
// deal with. Currently, only non-volatile memsets that affect the whole alloca
// are handled.
@@ -387,23 +470,48 @@ static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
}
-static Value *calculateVectorIndex(
- Value *Ptr, const std::map<GetElementPtrInst *, WeakTrackingVH> &GEPIdx) {
- auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
- if (!GEP)
- return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
+static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
+ IRBuilder<> B(Ptr->getContext());
+
+ Ptr = Ptr->stripPointerCasts();
+ if (Ptr == AA.Alloca)
+ return B.getInt32(0);
+
+ auto *GEP = cast<GetElementPtrInst>(Ptr);
+ auto I = AA.Vector.GEPVectorIdx.find(GEP);
+ assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");
+
+ if (!I->second.Full) {
+ Value *Result = nullptr;
+ B.SetInsertPoint(GEP);
+
+ if (I->second.VarIndex) {
+ Result = I->second.VarIndex;
+ Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty());
- auto I = GEPIdx.find(GEP);
- assert(I != GEPIdx.end() && "Must have entry for GEP!");
+ if (I->second.VarMul)
+ Result = B.CreateMul(Result, I->second.VarMul);
+ }
+
+ if (I->second.ConstIndex) {
+ if (Result)
+ Result = B.CreateAdd(Result, I->second.ConstIndex);
+ else
+ Result = I->second.ConstIndex;
+ }
+
+ if (!Result)
+ Result = B.getInt32(0);
+
+ I->second.Full = Result;
+ }
- Value *IndexValue = I->second;
- assert(IndexValue && "index value missing from GEP index map");
- return IndexValue;
+ return I->second.Full;
}
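Continuing the hypothetical (i * 2) + 1 decomposition from above, the lazy materialization at the GEP emits roughly the following (names and inline constants illustrative):

  // Built once per GEP, then cached in I->second.Full:
  Value *Idx = B.CreateSExtOrTrunc(VarIdx, B.getInt32Ty()); // I->second.VarIndex
  Idx = B.CreateMul(Idx, B.getInt32(2));                    // apply VarMul
  Idx = B.CreateAdd(Idx, B.getInt32(1));                    // apply ConstIndex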
-static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
- Type *VecElemTy, const DataLayout &DL,
- SmallVector<Instruction *> &NewInsts) {
+static std::optional<GEPToVectorIndex>
+computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
+ Type *VecElemTy, const DataLayout &DL) {
// TODO: Extracting a "multiple of X" from a GEP might be a useful generic
// helper.
LLVMContext &Ctx = GEP->getContext();
@@ -431,7 +539,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
Value *CurPtr = GEP;
while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
- return nullptr;
+ return {};
// Move to the next outer pointer.
CurPtr = CurGEP->getPointerOperand();
@@ -441,126 +549,78 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy);
if (VarOffsets.size() > 1)
- return nullptr;
+ return {};
APInt IndexQuot;
int64_t Rem;
APInt::sdivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
if (Rem != 0)
- return nullptr;
- if (VarOffsets.size() == 0)
- return ConstantInt::get(Ctx, IndexQuot);
+ return {};
- IRBuilder<> Builder(GEP);
+ GEPToVectorIndex Result;
+
+ if (!ConstOffset.isZero())
+ Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));
+
+ if (VarOffsets.empty())
+ return Result;
const auto &VarOffset = VarOffsets.front();
APInt OffsetQuot;
APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
if (Rem != 0 || OffsetQuot.isZero())
- return nullptr;
+ return {};
- Value *Offset = VarOffset.first;
- auto *OffsetType = dyn_cast<IntegerType>(Offset->getType());
+ Result.VarIndex = VarOffset.first;
+ auto *OffsetType = dyn_cast<IntegerType>(Result.VarIndex->getType());
if (!OffsetType)
- return nullptr;
+ return {};
- if (!OffsetQuot.isOne()) {
- ConstantInt *ConstMul =
- ConstantInt::get(Ctx, OffsetQuot.sext(OffsetType->getBitWidth()));
- Offset = Builder.CreateMul(Offset, ConstMul);
- if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
- NewInsts.push_back(NewInst);
- }
- if (ConstOffset.isZero())
- return Offset;
-
- ConstantInt *ConstIndex =
- ConstantInt::get(Ctx, IndexQuot.sext(OffsetType->getBitWidth()));
- Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex);
- if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
- NewInsts.push_back(NewInst);
- return IndexAdd;
+ if (!OffsetQuot.isOne())
+ Result.VarMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW));
+
+ return Result;
}
/// Promotes a single user of the alloca to a vector form.
///
/// \param Inst Instruction to be promoted.
/// \param DL Module Data Layout.
-/// \param VectorTy Vectorized Type.
+/// \param AA Alloca Analysis.
/// \param VecStoreSize Size of \p VectorTy in bytes.
/// \param ElementSize Size of \p VectorTy element type in bytes.
-/// \param TransferInfo MemTransferInst info map.
-/// \param GEPVectorIdx GEP -> VectorIdx cache.
/// \param CurVal Current value of the vector (e.g. last stored value)
/// \param[out] DeferredLoads \p Inst is added to this vector if it can't
/// be promoted now. This happens when promoting requires \p
/// CurVal, but \p CurVal is nullptr.
/// \return the stored value if \p Inst would have written to the alloca, or
/// nullptr otherwise.
-static Value *promoteAllocaUserToVector(
- Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
- unsigned VecStoreSize, unsigned ElementSize,
- DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
- std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal,
- SmallVectorImpl<LoadInst *> &DeferredLoads) {
+static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
+ AllocaAnalysis &AA,
+ unsigned VecStoreSize,
+ unsigned ElementSize,
+ function_ref<Value *()> GetCurVal) {
// Note: we use InstSimplifyFolder because it can leverage the DataLayout
// to do more folding, especially in the case of vector splats.
IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
InstSimplifyFolder(DL));
Builder.SetInsertPoint(Inst);
- const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
- if (CurVal)
- return CurVal;
-
- // If the current value is not known, insert a dummy load and lower it on
- // the second pass.
- LoadInst *Dummy =
- Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()),
- "promotealloca.dummyload");
- DeferredLoads.push_back(Dummy);
- return Dummy;
- };
-
- const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
- Type *PtrTy) -> Value * {
- assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
- const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
- if (!PtrTy->isVectorTy())
- return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
- const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
- // If we want to cast to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to
- // first cast the ptr vector to <2 x i64>.
- assert((Size % NumPtrElts == 0) && "Vector size not divisble");
- Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
- return Builder.CreateBitOrPointerCast(
- Val, FixedVectorType::get(EltTy, NumPtrElts));
- };
-
- Type *VecEltTy = VectorTy->getElementType();
+ Type *VecEltTy = AA.Vector.Ty->getElementType();
switch (Inst->getOpcode()) {
case Instruction::Load: {
- // Loads can only be lowered if the value is known.
- if (!CurVal) {
- DeferredLoads.push_back(cast<LoadInst>(Inst));
- return nullptr;
- }
-
- Value *Index = calculateVectorIndex(
- cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
+ Value *CurVal = GetCurVal();
+ Value *Index =
+ calculateVectorIndex(cast<LoadInst>(Inst)->getPointerOperand(), AA);
// We're loading the full vector.
Type *AccessTy = Inst->getType();
TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
if (Constant *CI = dyn_cast<Constant>(Index)) {
if (CI->isZeroValue() && AccessSize == VecStoreSize) {
- if (AccessTy->isPtrOrPtrVectorTy())
- CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
- else if (CurVal->getType()->isPtrOrPtrVectorTy())
- CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
- Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
- Inst->replaceAllUsesWith(NewVal);
+ Inst->replaceAllUsesWith(
+ Builder.CreateBitPreservingCastChain(DL, CurVal, AccessTy));
return nullptr;
}
}
@@ -572,6 +632,36 @@ static Value *promoteAllocaUserToVector(
auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
+      // If the index is dynamic, sandwich the load with bitcasts,
+      // i.e. VectorTy   SubVecTy                 AccessTy
+      //      <64 x i8> -> <16 x i8>              <8 x i16>
+      //      <64 x i8> -> <4 x i128> -> i128  -> <8 x i16>
+      // Extracting a subvector with a dynamic index has a very large
+      // expansion in the AMDGPU backend, so limit this to power-of-2
+      // element counts.
+ FixedVectorType *VectorTy = AA.Vector.Ty;
+ TypeSize NumBits = DL.getTypeStoreSize(SubVecTy) * 8u;
+ uint64_t LoadAlign = cast<LoadInst>(Inst)->getAlign().value();
+ bool IsAlignedLoad = NumBits <= (LoadAlign * 8u);
+ unsigned TotalNumElts = VectorTy->getNumElements();
+ bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0;
+ if (!isa<ConstantInt>(Index) &&
+ llvm::isPowerOf2_32(SubVecTy->getNumElements()) &&
+ IsProperlyDivisible && IsAlignedLoad) {
+ IntegerType *NewElemTy = Builder.getIntNTy(NumBits);
+ const unsigned NewNumElts =
+ DL.getTypeStoreSize(VectorTy) * 8u / NumBits;
+ const unsigned LShrAmt = llvm::Log2_32(SubVecTy->getNumElements());
+ FixedVectorType *BitCastTy =
+ FixedVectorType::get(NewElemTy, NewNumElts);
+ Value *BCVal = Builder.CreateBitCast(CurVal, BitCastTy);
+ Value *NewIdx = Builder.CreateLShr(
+ Index, ConstantInt::get(Index->getType(), LShrAmt));
+ Value *ExtVal = Builder.CreateExtractElement(BCVal, NewIdx);
+ Value *BCOut = Builder.CreateBitCast(ExtVal, AccessTy);
+ Inst->replaceAllUsesWith(BCOut);
+ return nullptr;
+ }
+
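Plugging the comment's <64 x i8> example into the computation above:

  // VectorTy = <64 x i8>, SubVecTy = <16 x i8>, AccessTy = <8 x i16>:
  //   NumBits    = 16 * 8         = 128 -> NewElemTy = i128
  //   NewNumElts = (64 * 8) / 128 = 4   -> BitCastTy = <4 x i128>
  //   LShrAmt    = log2(16)       = 4   -> NewIdx    = Index >> 4
  // One dynamic extractelement from <4 x i128> replaces a 16-element
  // dynamic subvector extraction; a final bitcast yields the <8 x i16>.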
Value *SubVec = PoisonValue::get(SubVecTy);
for (unsigned K = 0; K < NumLoadedElts; ++K) {
Value *CurIdx =
@@ -580,13 +670,8 @@ static Value *promoteAllocaUserToVector(
SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
}
- if (AccessTy->isPtrOrPtrVectorTy())
- SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
- else if (SubVecTy->isPtrOrPtrVectorTy())
- SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
-
- SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
- Inst->replaceAllUsesWith(SubVec);
+ Inst->replaceAllUsesWith(
+ Builder.CreateBitPreservingCastChain(DL, SubVec, AccessTy));
return nullptr;
}
@@ -604,39 +689,27 @@ static Value *promoteAllocaUserToVector(
// to know the current value. If this is a store of a single element, we
// need to know the value.
StoreInst *SI = cast<StoreInst>(Inst);
- Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
+ Value *Index = calculateVectorIndex(SI->getPointerOperand(), AA);
Value *Val = SI->getValueOperand();
// We're storing the full vector, we can handle this without knowing CurVal.
Type *AccessTy = Val->getType();
TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
- if (Constant *CI = dyn_cast<Constant>(Index)) {
- if (CI->isZeroValue() && AccessSize == VecStoreSize) {
- if (AccessTy->isPtrOrPtrVectorTy())
- Val = CreateTempPtrIntCast(Val, AccessTy);
- else if (VectorTy->isPtrOrPtrVectorTy())
- Val = CreateTempPtrIntCast(Val, VectorTy);
- return Builder.CreateBitOrPointerCast(Val, VectorTy);
- }
- }
+ if (Constant *CI = dyn_cast<Constant>(Index))
+ if (CI->isZeroValue() && AccessSize == VecStoreSize)
+ return Builder.CreateBitPreservingCastChain(DL, Val, AA.Vector.Ty);
// Storing a subvector.
if (isa<FixedVectorType>(AccessTy)) {
assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
const unsigned NumWrittenElts =
AccessSize / DL.getTypeStoreSize(VecEltTy);
- const unsigned NumVecElts = VectorTy->getNumElements();
+ const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
- if (SubVecTy->isPtrOrPtrVectorTy())
- Val = CreateTempPtrIntCast(Val, SubVecTy);
- else if (AccessTy->isPtrOrPtrVectorTy())
- Val = CreateTempPtrIntCast(Val, AccessTy);
-
- Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
-
- Value *CurVec = GetOrLoadCurrentVectorValue();
+ Val = Builder.CreateBitPreservingCastChain(DL, Val, SubVecTy);
+ Value *CurVec = GetCurVal();
for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
K < NumElts; ++K) {
Value *CurIdx =
@@ -649,22 +722,21 @@ static Value *promoteAllocaUserToVector(
if (Val->getType() != VecEltTy)
Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
- return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
- Index);
+ return Builder.CreateInsertElement(GetCurVal(), Val, Index);
}
case Instruction::Call: {
if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
// For memcpy, we need to know curval.
ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
unsigned NumCopied = Length->getZExtValue() / ElementSize;
- MemTransferInfo *TI = &TransferInfo[MTI];
+ MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI];
unsigned SrcBegin = TI->SrcIndex->getZExtValue();
unsigned DestBegin = TI->DestIndex->getZExtValue();
SmallVector<int> Mask;
- for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
+ for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
- Mask.push_back(SrcBegin < VectorTy->getNumElements()
+ Mask.push_back(SrcBegin < AA.Vector.Ty->getNumElements()
? SrcBegin++
: PoisonMaskElem);
} else {
@@ -672,7 +744,7 @@ static Value *promoteAllocaUserToVector(
}
}
- return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);
+ return Builder.CreateShuffleVector(GetCurVal(), Mask);
}
if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
@@ -693,14 +765,14 @@ static Value *promoteAllocaUserToVector(
Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
}
- return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
+ return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt);
}
if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
Intr->replaceAllUsesWith(
Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
- DL.getTypeAllocSize(VectorTy)));
+ DL.getTypeAllocSize(AA.Vector.Ty)));
return nullptr;
}
}
@@ -791,16 +863,13 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
return I;
}
-// FIXME: Should try to pick the most likely to be profitable allocas first.
-bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
- LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
-
+FixedVectorType *
+AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
if (DisablePromoteAllocaToVector) {
- LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n");
- return false;
+ LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n");
+ return nullptr;
}
- Type *AllocaTy = Alloca.getAllocatedType();
auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
uint64_t NumElems = 1;
@@ -832,10 +901,9 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
}
}
}
-
if (!VectorTy) {
LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
- return false;
+ return nullptr;
}
const unsigned MaxElements =
@@ -845,46 +913,46 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
VectorTy->getNumElements() < 2) {
LLVM_DEBUG(dbgs() << " " << *VectorTy
<< " has an unsupported number of elements\n");
- return false;
+ return nullptr;
}
- std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx;
- SmallVector<Instruction *> WorkList;
- SmallVector<Instruction *> UsersToRemove;
- SmallVector<Instruction *> DeferredInsts;
- SmallVector<Instruction *> NewGEPInsts;
- DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
-
- const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
- LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
- << " " << *Inst << "\n");
- for (auto *Inst : reverse(NewGEPInsts))
- Inst->eraseFromParent();
- return false;
- };
-
- SmallVector<Use *, 8> Uses;
- collectAllocaUses(Alloca, Uses);
-
- LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");
-
Type *VecEltTy = VectorTy->getElementType();
unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
"does not match the type's size\n");
- return false;
+ return nullptr;
}
- unsigned ElementSize = ElementSizeInBits / 8;
+
+ return VectorTy;
+}
+
+void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
+ if (AA.HaveSelectOrPHI) {
+ LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n");
+ return;
+ }
+
+ Type *AllocaTy = AA.Alloca->getAllocatedType();
+ AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
+ if (!AA.Vector.Ty)
+ return;
+
+ const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
+ LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
+ << " " << *Inst << "\n");
+ AA.Vector.Ty = nullptr;
+ };
+
+ Type *VecEltTy = AA.Vector.Ty->getElementType();
+ unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
assert(ElementSize > 0);
- for (auto *U : Uses) {
+ for (auto *U : AA.Uses) {
Instruction *Inst = cast<Instruction>(U->getUser());
if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
- // This is a store of the pointer, not to the pointer.
- if (isa<StoreInst>(Inst) &&
- U->getOperandNo() != StoreInst::getPointerOperandIndex())
- return RejectUser(Inst, "pointer is being stored");
+ assert(!isa<StoreInst>(Inst) ||
+ U->getOperandNo() == StoreInst::getPointerOperandIndex());
Type *AccessTy = getLoadStoreType(Inst);
if (AccessTy->isAggregateType())
@@ -900,34 +968,35 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
Ptr = Ptr->stripPointerCasts();
// Alloca already accessed as vector.
- if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
- DL->getTypeStoreSize(AccessTy)) {
- WorkList.push_back(Inst);
+ if (Ptr == AA.Alloca &&
+ DL->getTypeStoreSize(AA.Alloca->getAllocatedType()) ==
+ DL->getTypeStoreSize(AccessTy)) {
+ AA.Vector.Worklist.push_back(Inst);
continue;
}
- if (!isSupportedAccessType(VectorTy, AccessTy, *DL))
+ if (!isSupportedAccessType(AA.Vector.Ty, AccessTy, *DL))
return RejectUser(Inst, "not a supported access type");
- WorkList.push_back(Inst);
+ AA.Vector.Worklist.push_back(Inst);
continue;
}
if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
// If we can't compute a vector index from this GEP, then we can't
// promote this alloca to vector.
- Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
+ auto Index = computeGEPToVectorIndex(GEP, AA.Alloca, VecEltTy, *DL);
if (!Index)
return RejectUser(Inst, "cannot compute vector index for GEP");
- GEPVectorIdx[GEP] = Index;
- UsersToRemove.push_back(Inst);
+ AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value());
+ AA.Vector.UsersToRemove.push_back(Inst);
continue;
}
if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
- MSI && isSupportedMemset(MSI, &Alloca, *DL)) {
- WorkList.push_back(Inst);
+ MSI && isSupportedMemset(MSI, AA.Alloca, *DL)) {
+ AA.Vector.Worklist.push_back(Inst);
continue;
}
@@ -940,31 +1009,32 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
return RejectUser(Inst, "mem transfer inst length is non-constant or "
"not a multiple of the vector element size");
- if (TransferInfo.try_emplace(TransferInst).second) {
- DeferredInsts.push_back(Inst);
- WorkList.push_back(Inst);
- }
+ auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * {
+ if (Ptr == AA.Alloca)
+ return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
- auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * {
- GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
- if (Ptr != &Alloca && !GEPVectorIdx.count(GEP))
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
+ const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second;
+ if (GEPI.VarIndex)
return nullptr;
-
- return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx));
+ if (GEPI.ConstIndex)
+ return GEPI.ConstIndex;
+ return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
};
+ MemTransferInfo *TI =
+ &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second;
unsigned OpNum = U->getOperandNo();
- MemTransferInfo *TI = &TransferInfo[TransferInst];
if (OpNum == 0) {
Value *Dest = TransferInst->getDest();
- ConstantInt *Index = getPointerIndexOfAlloca(Dest);
+ ConstantInt *Index = getConstIndexIntoAlloca(Dest);
if (!Index)
return RejectUser(Inst, "could not calculate constant dest index");
TI->DestIndex = Index;
} else {
assert(OpNum == 1);
Value *Src = TransferInst->getSource();
- ConstantInt *Index = getPointerIndexOfAlloca(Src);
+ ConstantInt *Index = getConstIndexIntoAlloca(Src);
if (!Index)
return RejectUser(Inst, "could not calculate constant src index");
TI->SrcIndex = Index;
@@ -974,7 +1044,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
- WorkList.push_back(Inst);
+ AA.Vector.Worklist.push_back(Inst);
continue;
}
}
@@ -983,97 +1053,114 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
if (isAssumeLikeIntrinsic(Inst)) {
if (!Inst->use_empty())
return RejectUser(Inst, "assume-like intrinsic cannot have any users");
- UsersToRemove.push_back(Inst);
+ AA.Vector.UsersToRemove.push_back(Inst);
continue;
}
if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
return isAssumeLikeIntrinsic(cast<Instruction>(U));
})) {
- UsersToRemove.push_back(Inst);
+ AA.Vector.UsersToRemove.push_back(Inst);
continue;
}
return RejectUser(Inst, "unhandled alloca user");
}
- while (!DeferredInsts.empty()) {
- Instruction *Inst = DeferredInsts.pop_back_val();
- MemTransferInst *TransferInst = cast<MemTransferInst>(Inst);
- // TODO: Support the case if the pointers are from different alloca or
- // from different address spaces.
- MemTransferInfo &Info = TransferInfo[TransferInst];
- if (!Info.SrcIndex || !Info.DestIndex)
- return RejectUser(
- Inst, "mem transfer inst is missing constant src and/or dst index");
+ // Follow-up check to ensure we've seen both sides of all transfer insts.
+ for (const auto &Entry : AA.Vector.TransferInfo) {
+ const MemTransferInfo &TI = Entry.second;
+ if (!TI.SrcIndex || !TI.DestIndex)
+ return RejectUser(Entry.first,
+ "mem transfer inst between different objects");
+ AA.Vector.Worklist.push_back(Entry.first);
}
+}
- LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
- << *VectorTy << '\n');
- const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy);
+void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
+ LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n');
+ LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType()
+ << " -> " << *AA.Vector.Ty << '\n');
+ const unsigned VecStoreSize = DL->getTypeStoreSize(AA.Vector.Ty);
+
+ Type *VecEltTy = AA.Vector.Ty->getElementType();
+ const unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
// Alloca is uninitialized memory. Imitate that by making the first value
// undef.
SSAUpdater Updater;
- Updater.Initialize(VectorTy, "promotealloca");
+ Updater.Initialize(AA.Vector.Ty, "promotealloca");
- BasicBlock *EntryBB = Alloca.getParent();
+ BasicBlock *EntryBB = AA.Alloca->getParent();
BasicBlock::iterator InitInsertPos =
- skipToNonAllocaInsertPt(*EntryBB, Alloca.getIterator());
- // Alloca memory is undefined to begin, not poison.
- Value *AllocaInitValue =
- new FreezeInst(PoisonValue::get(VectorTy), "", InitInsertPos);
- AllocaInitValue->takeName(&Alloca);
+ skipToNonAllocaInsertPt(*EntryBB, AA.Alloca->getIterator());
+ IRBuilder<> Builder(&*InitInsertPos);
+ Value *AllocaInitValue = Builder.CreateFreeze(PoisonValue::get(AA.Vector.Ty));
+ AllocaInitValue->takeName(AA.Alloca);
- Updater.AddAvailableValue(EntryBB, AllocaInitValue);
+ Updater.AddAvailableValue(AA.Alloca->getParent(), AllocaInitValue);
- // First handle the initial worklist.
- SmallVector<LoadInst *, 4> DeferredLoads;
- forEachWorkListItem(WorkList, [&](Instruction *I) {
+ // First handle the initial worklist, in basic block order.
+ //
+ // Insert a placeholder whenever we need the vector value at the top of a
+ // basic block.
+ SmallVector<Instruction *> Placeholders;
+ forEachWorkListItem(AA.Vector.Worklist, [&](Instruction *I) {
BasicBlock *BB = I->getParent();
- // On the first pass, we only take values that are trivially known, i.e.
- // where AddAvailableValue was already called in this block.
- Value *Result = promoteAllocaUserToVector(
- I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
- Updater.FindValueForBlock(BB), DeferredLoads);
+ auto GetCurVal = [&]() -> Value * {
+ if (Value *CurVal = Updater.FindValueForBlock(BB))
+ return CurVal;
+
+ if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
+ return Placeholders.back();
+
+ // If the current value in the basic block is not yet known, insert a
+ // placeholder that we will replace later.
+ IRBuilder<> Builder(I);
+ auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
+ PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder"));
+ Placeholders.push_back(Placeholder);
+ return Placeholders.back();
+ };
+
+ Value *Result = promoteAllocaUserToVector(I, *DL, AA, VecStoreSize,
+ ElementSize, GetCurVal);
if (Result)
Updater.AddAvailableValue(BB, Result);
});
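Illustrative flow for the placeholder path (block and value names hypothetical):

  // A load in block BB runs before any store to the alloca in BB:
  //   GetCurVal() emits   %p = freeze <4 x i32> poison   ; placeholder
  //   and the load is rewritten in terms of %p.
  // After the walk, Updater.GetValueInMiddleOfBlock(BB) computes the real
  // incoming value (inserting PHIs as needed); %p is then RAUW'd and erased
  // by the fixup loops below.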
- // Then handle deferred loads.
- forEachWorkListItem(DeferredLoads, [&](Instruction *I) {
- SmallVector<LoadInst *, 0> NewDLs;
- BasicBlock *BB = I->getParent();
- // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
- // get a value, inserting PHIs as needed.
- Value *Result = promoteAllocaUserToVector(
- I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
- Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs);
- if (Result)
- Updater.AddAvailableValue(BB, Result);
- assert(NewDLs.empty() && "No more deferred loads should be queued!");
- });
+ // Now fixup the placeholders.
+ SmallVector<Value *> PlaceholderToNewVal(Placeholders.size());
+ for (auto [Index, Placeholder] : enumerate(Placeholders)) {
+ Value *NewVal = Updater.GetValueInMiddleOfBlock(Placeholder->getParent());
+ PlaceholderToNewVal[Index] = NewVal;
+ Placeholder->replaceAllUsesWith(NewVal);
+ }
+  // Note: we cannot merge this loop with the previous one because the
+  // SSAUpdater may itself still reference a placeholder and introduce new
+  // uses of it; the replaceAllUsesWith above does not cover those.
+ for (auto [Index, Placeholder] : enumerate(Placeholders)) {
+ if (!Placeholder->use_empty())
+ Placeholder->replaceAllUsesWith(PlaceholderToNewVal[Index]);
+ Placeholder->eraseFromParent();
+ }
- // Delete all instructions. On the first pass, new dummy loads may have been
- // added so we need to collect them too.
- DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
- InstsToDelete.insert_range(DeferredLoads);
- for (Instruction *I : InstsToDelete) {
+ // Delete all instructions.
+ for (Instruction *I : AA.Vector.Worklist) {
assert(I->use_empty());
I->eraseFromParent();
}
// Delete all the users that are known to be removeable.
- for (Instruction *I : reverse(UsersToRemove)) {
+ for (Instruction *I : reverse(AA.Vector.UsersToRemove)) {
I->dropDroppableUses();
assert(I->use_empty());
I->eraseFromParent();
}
// Alloca should now be dead too.
- assert(Alloca.use_empty());
- Alloca.eraseFromParent();
- return true;
+ assert(AA.Alloca->use_empty());
+ AA.Alloca->eraseFromParent();
}
std::pair<Value *, Value *>
@@ -1247,61 +1334,78 @@ bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
return true;
}
-bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
- Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {
+void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
+ if (DisablePromoteAllocaToLDS) {
+ LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
+ return;
+ }
- for (User *User : Val->users()) {
- if (is_contained(WorkList, User))
- continue;
+ // Don't promote the alloca to LDS for shader calling conventions as the work
+ // item ID intrinsics are not supported for these calling conventions.
+  // Furthermore, not all LDS is available for some of the stages.
+ const Function &ContainingFunction = *AA.Alloca->getFunction();
+ CallingConv::ID CC = ContainingFunction.getCallingConv();
+
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ break;
+ default:
+ LLVM_DEBUG(
+ dbgs()
+ << " promote alloca to LDS not supported with calling convention.\n");
+ return;
+ }
+
+ for (Use *Use : AA.Uses) {
+ auto *User = Use->getUser();
if (CallInst *CI = dyn_cast<CallInst>(User)) {
if (!isCallPromotable(CI))
- return false;
+ return;
- WorkList.push_back(User);
+ if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
+ AA.LDS.Worklist.push_back(User);
continue;
}
Instruction *UseInst = cast<Instruction>(User);
if (UseInst->getOpcode() == Instruction::PtrToInt)
- return false;
+ return;
if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
if (LI->isVolatile())
- return false;
+ return;
continue;
}
if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
if (SI->isVolatile())
- return false;
-
- // Reject if the stored value is not the pointer operand.
- if (SI->getPointerOperand() != Val)
- return false;
+ return;
continue;
}
if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
if (RMW->isVolatile())
- return false;
+ return;
continue;
}
if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
if (CAS->isVolatile())
- return false;
+ return;
continue;
}
// Only promote a select if we know that the other select operand
// is from another pointer that will also be promoted.
if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
- if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
- return false;
+ if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Use->get(), ICmp, 0, 1))
+ return;
// May need to rewrite constant operands.
- WorkList.push_back(ICmp);
+ if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
+ AA.LDS.Worklist.push_back(ICmp);
continue;
}
@@ -1309,28 +1413,8 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
// Be conservative if an address could be computed outside the bounds of
// the alloca.
if (!GEP->isInBounds())
- return false;
- } else if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
- // Only promote a select if we know that the other select operand is from
- // another pointer that will also be promoted.
- if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
- return false;
- } else if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
- // Repeat for phis.
-
- // TODO: Handle more complex cases. We should be able to replace loops
- // over arrays.
- switch (Phi->getNumIncomingValues()) {
- case 1:
- break;
- case 2:
- if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
- return false;
- break;
- default:
- return false;
- }
- } else if (!isa<ExtractElementInst>(User)) {
+ return;
+ } else if (!isa<ExtractElementInst, SelectInst, PHINode>(User)) {
// Do not promote vector/aggregate type instructions. It is hard to track
// their users.
@@ -1338,15 +1422,14 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
//
// TODO: If we know the address is only observed through flat pointers, we
// could still promote.
- return false;
+ return;
}
- WorkList.push_back(User);
- if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
- return false;
+ if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
+ AA.LDS.Worklist.push_back(User);
}
- return true;
+ AA.LDS.Enable = true;
}
bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
@@ -1378,7 +1461,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
for (const User *U : Val->users()) {
if (const Instruction *Use = dyn_cast<Instruction>(U)) {
- if (Use->getParent()->getParent() == &F)
+ if (Use->getFunction() == &F)
return true;
} else {
const Constant *C = cast<Constant>(U);
@@ -1419,7 +1502,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
for (const GlobalVariable *GV : UsedLDS) {
Align Alignment =
DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
- uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
+ uint64_t AllocSize = GV->getGlobalSize(DL);
// HIP uses an extern unsized array in local address space for dynamically
// allocated shared memory. In that case, we have to disable the promotion.
@@ -1477,44 +1560,24 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
}
// FIXME: Should try to pick the most likely to be profitable allocas first.
-bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
- bool SufficientLDS) {
- LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n');
-
- if (DisablePromoteAllocaToLDS) {
- LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
- return false;
- }
-
- const DataLayout &DL = Mod->getDataLayout();
- IRBuilder<> Builder(&I);
-
- const Function &ContainingFunction = *I.getParent()->getParent();
- CallingConv::ID CC = ContainingFunction.getCallingConv();
-
- // Don't promote the alloca to LDS for shader calling conventions as the work
- // item ID intrinsics are not supported for these calling conventions.
- // Furthermore not all LDS is available for some of the stages.
- switch (CC) {
- case CallingConv::AMDGPU_KERNEL:
- case CallingConv::SPIR_KERNEL:
- break;
- default:
- LLVM_DEBUG(
- dbgs()
- << " promote alloca to LDS not supported with calling convention.\n");
- return false;
- }
+bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
+ AllocaAnalysis &AA, bool SufficientLDS,
+ SetVector<IntrinsicInst *> &DeferredIntrs) {
+ LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');
// Not likely to have sufficient local memory for promotion.
if (!SufficientLDS)
return false;
+ const DataLayout &DL = Mod->getDataLayout();
+ IRBuilder<> Builder(AA.Alloca);
+
+ const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
- Align Alignment =
- DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
+ Align Alignment = DL.getValueOrABITypeAlignment(
+ AA.Alloca->getAlign(), AA.Alloca->getAllocatedType());
// FIXME: This computed padding is likely wrong since it depends on inverse
// usage order.
@@ -1524,7 +1587,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
uint32_t AllocSize =
- WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType());
+ WorkGroupSize * DL.getTypeAllocSize(AA.Alloca->getAllocatedType());
NewSize += AllocSize;
if (NewSize > LocalMemLimit) {
@@ -1535,24 +1598,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
CurrentLocalMemUsage = NewSize;
- std::vector<Value *> WorkList;
-
- if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
- LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
- return false;
- }
-
LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
- Function *F = I.getParent()->getParent();
+ Function *F = AA.Alloca->getFunction();
- Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
+ Type *GVTy = ArrayType::get(AA.Alloca->getAllocatedType(), WorkGroupSize);
GlobalVariable *GV = new GlobalVariable(
*Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
- Twine(F->getName()) + Twine('.') + I.getName(), nullptr,
+ Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr,
GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- GV->setAlignment(I.getAlign());
+ GV->setAlignment(AA.Alloca->getAlign());
Value *TCntY, *TCntZ;
@@ -1571,15 +1627,13 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};
Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
- I.mutateType(Offset->getType());
- I.replaceAllUsesWith(Offset);
- I.eraseFromParent();
-
- SmallVector<IntrinsicInst *> DeferredIntrs;
+ AA.Alloca->mutateType(Offset->getType());
+ AA.Alloca->replaceAllUsesWith(Offset);
+ AA.Alloca->eraseFromParent();
PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
- for (Value *V : WorkList) {
+ for (Value *V : AA.LDS.Worklist) {
CallInst *Call = dyn_cast<CallInst>(V);
if (!Call) {
if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
@@ -1637,7 +1691,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
      // These have two pointer operands. If the second pointer also needs to
      // be replaced, we defer processing of these intrinsics until all other
      // values are processed.
- DeferredIntrs.push_back(Intr);
+ DeferredIntrs.insert(Intr);
continue;
case Intrinsic::memset: {
MemSetInst *MemSet = cast<MemSetInst>(Intr);
@@ -1685,7 +1739,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
}
}
+ return true;
+}
+
+void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
+ SetVector<IntrinsicInst *> &DeferredIntrs) {
+
for (IntrinsicInst *Intr : DeferredIntrs) {
+ IRBuilder<> Builder(Intr);
-    Builder.SetInsertPoint(Intr);
Intrinsic::ID ID = Intr->getIntrinsicID();
assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
@@ -1703,6 +1764,4 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
Intr->eraseFromParent();
}
-
- return true;
}
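A minimal standalone sketch of the LDS budget arithmetic that
tryPromoteAllocaToLDS performs above (plain C++; the limit and sizes are
illustrative assumptions, not values taken from a real subtarget):

  #include <cstdint>
  #include <cstdio>

  // Round Value up to the next multiple of Align, as llvm::alignTo does.
  static uint64_t alignTo(uint64_t Value, uint64_t Align) {
    return (Value + Align - 1) / Align * Align;
  }

  int main() {
    const uint64_t LocalMemLimit = 65536;       // assumed LDS budget per workgroup
    const uint64_t CurrentLocalMemUsage = 1000; // bytes claimed by LDS globals
    const uint64_t WorkGroupSize = 256;         // upper bound of flat workgroup size
    const uint64_t AllocSize = 16;              // alloca size in bytes
    const uint64_t Alignment = 8;

    // One slot per work item: the alloca becomes [WorkGroupSize x AllocaTy].
    uint64_t NewSize =
        alignTo(CurrentLocalMemUsage, Alignment) + WorkGroupSize * AllocSize;
    printf("NewSize = %llu, promote = %s\n", (unsigned long long)NewSize,
           NewSize <= LocalMemLimit ? "yes" : "no"); // here: 5096, yes
  }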
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index fd604e1..e2e84ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -333,7 +333,7 @@ bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
Register Val = Src0->getOperand(0).getReg();
auto isOp3Zero = [&]() {
- MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI);
+ MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI);
if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT)
return Op3->getOperand(1).getFPImm()->isExactlyValue(0.0);
return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index e187959..888717f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
@@ -34,9 +35,17 @@
using namespace llvm;
using namespace AMDGPU;
+using namespace llvm::MIPatternMatch;
namespace {
+// AMDGPU-specific pattern matchers
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>
+m_GAMDGPUReadAnyLane(const SrcTy &Src) {
+ return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src);
+}
+
class AMDGPURegBankLegalize : public MachineFunctionPass {
public:
static char ID;
@@ -119,8 +128,9 @@ public:
bool isLaneMask(Register Reg);
std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode);
- std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src);
- Register getReadAnyLaneSrc(Register Src);
+ Register tryMatchUnmergeDefs(SmallVectorImpl<Register> &DefRegs);
+ SmallVector<Register> tryMatchMergeReadAnyLane(GMergeLikeInstr *Merge);
+ SmallVector<Register> getReadAnyLaneSrcs(Register Src);
void replaceRegWithOrBuildCopy(Register Dst, Register Src);
bool tryEliminateReadAnyLane(MachineInstr &Copy);
@@ -145,43 +155,74 @@ AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) {
return {MatchMI, MatchMI->getOperand(1).getReg()};
}
-std::pair<GUnmerge *, int>
-AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
- MachineInstr *ReadAnyLane = MRI.getVRegDef(Src);
- if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE)
- return {nullptr, -1};
-
- Register RALSrc = ReadAnyLane->getOperand(1).getReg();
- if (auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI))
- return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)};
+// Check if all registers are from the same unmerge and there is no shuffling.
+// Returns the unmerge source if both conditions are met.
+Register AMDGPURegBankLegalizeCombiner::tryMatchUnmergeDefs(
+ SmallVectorImpl<Register> &DefRegs) {
+ auto *UnMerge = getOpcodeDef<GUnmerge>(DefRegs[0], MRI);
+ if (!UnMerge || UnMerge->getNumDefs() != DefRegs.size())
+ return {};
+ for (unsigned I = 1; I < DefRegs.size(); ++I) {
+ if (UnMerge->getReg(I) != DefRegs[I])
+ return {};
+ }
+ return UnMerge->getSourceReg();
+}
- return {nullptr, -1};
+// Check if all merge sources are readanylanes; if so, return the readanylane
+// sources.
+SmallVector<Register> AMDGPURegBankLegalizeCombiner::tryMatchMergeReadAnyLane(
+ GMergeLikeInstr *Merge) {
+ SmallVector<Register> ReadAnyLaneSrcs;
+ for (unsigned i = 0; i < Merge->getNumSources(); ++i) {
+ Register Src;
+ if (!mi_match(Merge->getSourceReg(i), MRI,
+ m_GAMDGPUReadAnyLane(m_Reg(Src))))
+ return {};
+ ReadAnyLaneSrcs.push_back(Src);
+ }
+ return ReadAnyLaneSrcs;
}
-Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
+SmallVector<Register>
+AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrcs(Register Src) {
// Src = G_AMDGPU_READANYLANE RALSrc
- auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
- if (RAL)
- return RALSrc;
-
- // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
- // LoSgpr = G_AMDGPU_READANYLANE LoVgpr
- // HiSgpr = G_AMDGPU_READANYLANE HiVgpr
- // Src G_MERGE_VALUES LoSgpr, HiSgpr
- auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI);
- if (Merge) {
- unsigned NumElts = Merge->getNumSources();
- auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0));
- if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0)
+ Register RALSrc;
+ if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc))))
+ return {RALSrc};
+
+ // RALSrc = G_ANYEXT S16Src
+ // TruncSrc = G_AMDGPU_READANYLANE RALSrc
+ // Src = G_TRUNC TruncSrc
+ if (mi_match(Src, MRI,
+ m_GTrunc(m_GAMDGPUReadAnyLane(m_GAnyExt(m_Reg(RALSrc)))))) {
+ return {RALSrc};
+ }
+
+ // TruncSrc = G_AMDGPU_READANYLANE RALSrc
+ // AextSrc = G_TRUNC TruncSrc
+ // Src = G_ANYEXT AextSrc
+ if (mi_match(Src, MRI,
+ m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) {
+ return {RALSrc};
+ }
+
+ // Sgpr0 = G_AMDGPU_READANYLANE Vgpr0
+ // Sgpr1 = G_AMDGPU_READANYLANE Vgpr1
+ // ...
+ // Src = G_MERGE_LIKE Sgpr0, Sgpr1, ...
+ // Dst = COPY Src
+ if (auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI)) {
+ SmallVector<Register> ReadAnyLaneSrcs = tryMatchMergeReadAnyLane(Merge);
+ if (ReadAnyLaneSrcs.empty())
return {};
- // Check if all elements are from same unmerge and there is no shuffling.
- for (unsigned i = 1; i < NumElts; ++i) {
- auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i));
- if (UnmergeI != Unmerge || (unsigned)IdxI != i)
- return {};
- }
- return Unmerge->getSourceReg();
+ // Vgpr0, Vgpr1, ... = G_UNMERGE_VALUES UnmergeSrc
+ if (Register UnmergeSrc = tryMatchUnmergeDefs(ReadAnyLaneSrcs))
+ return {UnmergeSrc};
+
+ // Multiple ReadAnyLane vgpr sources, need to merge Vgpr0, Vgpr1, ...
+ return ReadAnyLaneSrcs;
}
// SrcRegIdx = G_AMDGPU_READANYLANE RALElSrc
@@ -192,7 +233,7 @@ Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
return {};
int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr);
- Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI);
+ auto *Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI);
if (!Merge || UnMerge->getNumDefs() != Merge->getNumSources())
return {};
@@ -202,7 +243,7 @@ Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
auto [RALEl, RALElSrc] = tryMatch(SrcRegIdx, AMDGPU::G_AMDGPU_READANYLANE);
if (RALEl)
- return RALElSrc;
+ return {RALElSrc};
return {};
}
@@ -234,17 +275,27 @@ bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane(
if (SrcMI.getOpcode() == AMDGPU::G_BITCAST)
RALDst = SrcMI.getOperand(1).getReg();
- Register RALSrc = getReadAnyLaneSrc(RALDst);
- if (!RALSrc)
+ B.setInstrAndDebugLoc(Copy);
+ SmallVector<Register> ReadAnyLaneSrcRegs = getReadAnyLaneSrcs(RALDst);
+ if (ReadAnyLaneSrcRegs.empty())
return false;
- B.setInstr(Copy);
+ Register ReadAnyLaneSrc;
+ if (ReadAnyLaneSrcRegs.size() == 1) {
+ ReadAnyLaneSrc = ReadAnyLaneSrcRegs[0];
+ } else {
+ // Multiple readanylane sources without a common unmerge, merge them.
+ auto Merge = B.buildMergeLikeInstr({VgprRB, MRI.getType(RALDst)},
+ ReadAnyLaneSrcRegs);
+ ReadAnyLaneSrc = Merge.getReg(0);
+ }
+
if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) {
// Src = READANYLANE RALSrc Src = READANYLANE RALSrc
// Dst = Copy Src $Dst = Copy Src
// -> ->
// Dst = RALSrc $Dst = Copy RALSrc
- replaceRegWithOrBuildCopy(Dst, RALSrc);
+ replaceRegWithOrBuildCopy(Dst, ReadAnyLaneSrc);
} else {
// RALDst = READANYLANE RALSrc RALDst = READANYLANE RALSrc
// Src = G_BITCAST RALDst Src = G_BITCAST RALDst
@@ -252,7 +303,7 @@ bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane(
// -> ->
// NewVgpr = G_BITCAST RALDst NewVgpr = G_BITCAST RALDst
// Dst = NewVgpr $Dst = Copy NewVgpr
- auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc);
+ auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, ReadAnyLaneSrc);
replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0));
}
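For intuition, the combine above hinges on a simple identity:
G_AMDGPU_READANYLANE changes only the register bank of a uniform value, never
the value itself, so merging per-lane readanylanes of an in-order unmerge
reproduces the unmerge source. A toy standalone model (plain C++, not MIR; the
64-bit value and its 32-bit parts are assumptions):

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  int main() {
    uint64_t Src = 0x1122334455667788ULL;
    // G_UNMERGE_VALUES: split into 32-bit parts (index 0 = low part).
    std::vector<uint32_t> Parts = {uint32_t(Src), uint32_t(Src >> 32)};
    // G_AMDGPU_READANYLANE is value-preserving, so it is elided here.
    // G_MERGE_VALUES of the unshuffled parts reproduces the source.
    uint64_t Merged = uint64_t(Parts[0]) | (uint64_t(Parts[1]) << 32);
    printf("identity holds: %s\n", Merged == Src ? "yes" : "no");
  }

tryMatchUnmergeDefs is exactly the "unshuffled" check: def I of the unmerge
must feed readanylane I of the merge, for every I.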
@@ -410,21 +461,15 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
unsigned Opc = MI->getOpcode();
// Insert point for use operands needs some calculation.
if (Opc == AMDGPU::G_PHI) {
- RBLHelper.applyMappingPHI(*MI);
+ if (!RBLHelper.applyMappingPHI(*MI))
+ return false;
continue;
}
// Opcodes that support pretty much all combinations of reg banks and LLTs
// (except S1). There is no point in writing rules for them.
- if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
- Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) {
- RBLHelper.applyMappingTrivial(*MI);
- continue;
- }
-
- // Opcodes that also support S1.
- if (Opc == G_FREEZE &&
- MRI.getType(MI->getOperand(0).getReg()) != LLT::scalar(1)) {
+ if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_MERGE_VALUES ||
+ Opc == AMDGPU::G_CONCAT_VECTORS || Opc == AMDGPU::G_BITCAST) {
RBLHelper.applyMappingTrivial(*MI);
continue;
}
@@ -441,7 +486,8 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
// S1 rules are in RegBankLegalizeRules.
}
- RBLHelper.findRuleAndApplyMapping(*MI);
+ if (!RBLHelper.findRuleAndApplyMapping(*MI))
+ return false;
}
// Sgpr S1 clean up combines:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 5407566..d262f07 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -19,6 +19,7 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
@@ -32,28 +33,48 @@ using namespace AMDGPU;
RegBankLegalizeHelper::RegBankLegalizeHelper(
MachineIRBuilder &B, const MachineUniformityInfo &MUI,
const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
- : ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
- MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()),
+ : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
+ MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
+ RBLRules(RBLRules), IsWave32(ST.isWave32()),
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
-void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
- const SetOfRulesForOpcode &RuleSet = RBLRules.getRulesForOpc(MI);
- const RegBankLLTMapping &Mapping = RuleSet.findMappingForMI(MI, MRI, MUI);
+bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
+ const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
+ if (!RuleSet) {
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "No AMDGPU RegBankLegalize rules defined for opcode",
+ MI);
+ return false;
+ }
+
+ const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
+ if (!Mapping) {
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: none of the rules defined with "
+ "'Any' for MI's opcode matched MI",
+ MI);
+ return false;
+ }
SmallSet<Register, 4> WaterfallSgprs;
unsigned OpIdx = 0;
- if (Mapping.DstOpMapping.size() > 0) {
+ if (Mapping->DstOpMapping.size() > 0) {
B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
- applyMappingDst(MI, OpIdx, Mapping.DstOpMapping);
+ if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
+ return false;
}
- if (Mapping.SrcOpMapping.size() > 0) {
+ if (Mapping->SrcOpMapping.size() > 0) {
B.setInstr(MI);
- applyMappingSrc(MI, OpIdx, Mapping.SrcOpMapping, WaterfallSgprs);
+ if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WaterfallSgprs))
+ return false;
}
- lower(MI, Mapping, WaterfallSgprs);
+ if (!lower(MI, *Mapping, WaterfallSgprs))
+ return false;
+
+ return true;
}
bool RegBankLegalizeHelper::executeInWaterfallLoop(
@@ -274,7 +295,7 @@ bool RegBankLegalizeHelper::executeInWaterfallLoop(
return true;
}
-void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
+bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
MachineFunction &MF = B.getMF();
assert(MI.getNumMemOperands() == 1);
@@ -322,9 +343,10 @@ void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
B.buildMergeLikeInstr(Dst, MergeTyParts);
}
MI.eraseFromParent();
+ return true;
}
-void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
+bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
LLT MergeTy) {
MachineFunction &MF = B.getMF();
assert(MI.getNumMemOperands() == 1);
@@ -350,9 +372,10 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
B.buildMergeLikeInstr(Dst, MergeTyParts);
}
MI.eraseFromParent();
+ return true;
}
-void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
+bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
Register Dst = MI.getDstReg();
Register Ptr = MI.getPointerReg();
MachineMemOperand &MMO = MI.getMMO();
@@ -376,9 +399,10 @@ void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
}
MI.eraseFromParent();
+ return true;
}
-void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
+bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(Dst);
Register Src = MI.getOperand(1).getReg();
@@ -404,15 +428,22 @@ void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
Hi = B.buildUndef({VgprRB_S32});
break;
default:
- llvm_unreachable("Opcode not supported");
+ reportGISelFailure(
+ MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
+ return false;
}
B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
} else {
- llvm_unreachable("Type not supported");
+ reportGISelFailure(
+ MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
+ return false;
}
MI.eraseFromParent();
+ return true;
}
std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
@@ -437,7 +468,14 @@ std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
return {Lo.getReg(0), Hi.getReg(0)};
}
-void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
+std::pair<Register, Register>
+RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
+ auto [Lo32, Hi32] = unpackAExt(Reg);
+ return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
+ B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
+}
+
+bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
Register Lo, Hi;
switch (MI.getOpcode()) {
case AMDGPU::G_SHL: {
@@ -462,13 +500,18 @@ void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
break;
}
default:
- llvm_unreachable("Unpack lowering not implemented");
+ reportGISelFailure(
+ MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
+ MI);
+ return false;
}
B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
MI.eraseFromParent();
+ return true;
}
-void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
+bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
Register Lo, Hi;
switch (MI.getOpcode()) {
case AMDGPU::G_SMIN:
@@ -494,10 +537,25 @@ void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
break;
}
default:
- llvm_unreachable("Unpack min/max lowering not implemented");
+ reportGISelFailure(
+ MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
+ return false;
}
B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
MI.eraseFromParent();
+ return true;
+}
+
+bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
+ auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
+ auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
+ auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
+ auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
+ B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
+ {ResLo.getReg(0), ResHi.getReg(0)});
+ MI.eraseFromParent();
+ return true;
}
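lowerUnpackAExt above performs a <2 x s16> operation as two 32-bit operations
on the any-extended halves and repacks the truncated results; only the low 16
bits of each 32-bit result are meaningful. A standalone sketch of that
arithmetic for an add (plain C++; the packed constants are assumptions):

  #include <cstdint>
  #include <cstdio>

  int main() {
    uint32_t A = 0x00030005; // <2 x s16> packed as {hi = 3, lo = 5}
    uint32_t B = 0x00040007; // {hi = 4, lo = 7}
    // unpackAExt: any-extend each 16-bit half to 32 bits.
    uint32_t ALo = A & 0xffff, AHi = A >> 16;
    uint32_t BLo = B & 0xffff, BHi = B >> 16;
    // Run the operation at 32 bits (ResLo/ResHi model the two built ops).
    uint32_t ResLo = ALo + BLo, ResHi = AHi + BHi;
    // buildBuildVectorTrunc: truncate each result and repack.
    uint32_t Packed = (ResLo & 0xffff) | ((ResHi & 0xffff) << 16);
    printf("0x%08x\n", Packed); // 0x0007000c
  }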
static bool isSignedBFE(MachineInstr &MI) {
@@ -507,7 +565,7 @@ static bool isSignedBFE(MachineInstr &MI) {
return MI.getOpcode() == AMDGPU::G_SBFX;
}
-void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
+bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
assert(MRI.getType(Dst) == LLT::scalar(64));
bool Signed = isSignedBFE(MI);
@@ -534,7 +592,7 @@ void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
MI.eraseFromParent();
- return;
+ return true;
}
uint64_t WidthImm = ConstWidth->Value.getZExtValue();
@@ -564,9 +622,10 @@ void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
}
MI.eraseFromParent();
+ return true;
}
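The shift pair built above is the classic bitfield-extract idiom: shift the
field's top bit up to bit 63, then shift back down arithmetically (signed BFE)
or logically (unsigned BFE). A standalone sketch (plain C++; the Offset and
Width values are illustrative):

  #include <cstdint>
  #include <cstdio>

  // Extract Width bits starting at Offset, sign-extending the field.
  static int64_t sbfe64(int64_t Src, unsigned Offset, unsigned Width) {
    unsigned Amt = 64 - Width;
    uint64_t Field = (uint64_t)Src >> Offset;
    return (int64_t)(Field << Amt) >> Amt; // arithmetic shift sign-extends
  }

  // Same extraction, zero-extending the field.
  static uint64_t ubfe64(uint64_t Src, unsigned Offset, unsigned Width) {
    unsigned Amt = 64 - Width;
    return (Src >> Offset) << Amt >> Amt; // logical shift zero-fills
  }

  int main() {
    printf("%lld\n", (long long)sbfe64(0xF0, 4, 4));          // -1
    printf("%llu\n", (unsigned long long)ubfe64(0xF0, 4, 4)); // 15
  }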
-void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
+bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
Register DstReg = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(DstReg);
bool Signed = isSignedBFE(MI);
@@ -591,15 +650,15 @@ void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
// copies from reg class to reg bank.
auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
{B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
- if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
- *ST.getRegisterInfo(), RBI))
- llvm_unreachable("failed to constrain BFE");
+ constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
+ *ST.getRegisterInfo(), RBI);
B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
MI.eraseFromParent();
+ return true;
}
-void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
+bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst);
assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
@@ -614,9 +673,113 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
B.buildMergeLikeInstr(Dst, {Lo, Hi});
MI.eraseFromParent();
+ return true;
}
-void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
+bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
+ Register Dst = MI.getOperand(0).getReg();
+ assert(MRI.getType(Dst) == S64);
+ auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
+ auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());
+
+  // TODO: Use G_AMDGPU_MAD_* optimizations for divergent S64 G_MUL to match
+  // GlobalISel output with the old regbankselect.
+ auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
+ auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
+ auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
+ auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
+ auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
+ auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
+
+ B.buildMergeLikeInstr(Dst, {Lo, Hi});
+ MI.eraseFromParent();
+ return true;
+}
+
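lowerSplitTo32Mul above is the schoolbook decomposition of a 64-bit multiply
into 32-bit pieces: bits 64..127 of the full product are discarded, so only
three partial products and one mulhi are needed. A standalone check (plain
C++; the operand values are arbitrary):

  #include <cstdint>
  #include <cstdio>

  int main() {
    uint64_t A = 0x123456789ABCDEF0ULL, B = 0x0FEDCBA987654321ULL;
    uint32_t A0 = (uint32_t)A, A1 = (uint32_t)(A >> 32);
    uint32_t B0 = (uint32_t)B, B1 = (uint32_t)(B >> 32);

    uint32_t Lo    = A0 * B0;                               // G_MUL
    uint32_t Carry = (uint32_t)(((uint64_t)A0 * B0) >> 32); // G_UMULH
    uint32_t Sum   = A0 * B1 + A1 * B0;                     // two G_MULs + G_ADD
    uint32_t Hi    = Sum + Carry;                           // G_ADD

    uint64_t Split = ((uint64_t)Hi << 32) | Lo;
    printf("match: %s\n", Split == A * B ? "yes" : "no");   // yes
  }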
+bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
+ Register Dst = MI.getOperand(0).getReg();
+ assert(MRI.getType(Dst) == V2S16);
+ unsigned Opc = MI.getOpcode();
+ unsigned NumOps = MI.getNumOperands();
+ auto Flags = MI.getFlags();
+
+ auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
+
+ if (NumOps == 2) {
+ auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
+ auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
+ B.buildMergeLikeInstr(Dst, {Lo, Hi});
+ MI.eraseFromParent();
+ return true;
+ }
+
+ auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
+
+ if (NumOps == 3) {
+ auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
+ auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
+ B.buildMergeLikeInstr(Dst, {Lo, Hi});
+ MI.eraseFromParent();
+ return true;
+ }
+
+ assert(NumOps == 4);
+ auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
+ auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
+ auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
+ B.buildMergeLikeInstr(Dst, {Lo, Hi});
+ MI.eraseFromParent();
+ return true;
+}
+
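lowerSplitTo16 above scalarizes a uniform <2 x s16> operation into two true
16-bit scalar operations (unlike UnpackAExt, which widens to 32 bits), with
the NumOps check dispatching between unary, binary and ternary opcodes. A
standalone sketch for the ternary case (plain C++; the multiply-add stands in
for an arbitrary three-source op and the constants are assumptions):

  #include <cstdint>
  #include <cstdio>

  int main() {
    uint32_t A = 0x00030005, B = 0x00040007, C = 0x00010002; // <2 x s16> values
    // unpackAExtTruncS16: extract each half as a genuine 16-bit value.
    uint16_t ALo = (uint16_t)A, AHi = (uint16_t)(A >> 16);
    uint16_t BLo = (uint16_t)B, BHi = (uint16_t)(B >> 16);
    uint16_t CLo = (uint16_t)C, CHi = (uint16_t)(C >> 16);
    // Run the scalar op per element at 16 bits.
    uint16_t Lo = (uint16_t)(ALo * BLo + CLo);
    uint16_t Hi = (uint16_t)(AHi * BHi + CHi);
    // buildMergeLikeInstr: repack into <2 x s16>.
    uint32_t Res = (uint32_t)Lo | ((uint32_t)Hi << 16);
    printf("0x%08x\n", Res); // 0x000d0025
  }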
+bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
+ Register Dst0 = MI.getOperand(0).getReg();
+ Register Dst1 = MI.getOperand(1).getReg();
+ Register Src0 = MI.getOperand(2).getReg();
+ Register Src1 = MI.getOperand(3).getReg();
+ Register Src2 = MI.getOperand(4).getReg();
+
+ // Keep the multiplication on the SALU.
+ Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
+ Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
+ if (ST.hasScalarMulHiInsts()) {
+ B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
+ } else {
+ auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
+ auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
+ auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
+ buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
+ }
+
+ // Accumulate and produce the "carry-out" bit.
+
+ // The "carry-out" is defined as bit 64 of the result when computed as a
+ // big integer. For unsigned multiply-add, this matches the usual
+ // definition of carry-out.
+ if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
+ // No accumulate: result is just the multiplication, carry is 0.
+ B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
+ B.buildConstant(Dst1, 0);
+ } else {
+ // Accumulate: add Src2 to the multiplication result with carry chain.
+ Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
+ Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
+ B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
+
+ auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
+ auto AddHi =
+ B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
+ B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
+ B.buildCopy(Dst1, AddHi.getReg(1));
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
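The carry-out that lowerUniMAD64 above produces is bit 64 of the result
computed at infinite precision; with a zero accumulator the carry is constant
0, otherwise it falls out of the G_UADDO/G_UADDE chain. A standalone check
(plain C++; the operands are chosen to overflow):

  #include <cstdint>
  #include <cstdio>

  int main() {
    uint32_t Src0 = 0xFFFFFFFFu, Src1 = 0xFFFFFFFFu;
    uint64_t Src2 = 0xFFFFFFFFFFFFFFFFULL;

    // 32x32 -> 64 multiply, kept as lo/hi halves (G_MUL + G_UMULH).
    uint64_t Mul = (uint64_t)Src0 * Src1;
    uint32_t DstLo = (uint32_t)Mul, DstHi = (uint32_t)(Mul >> 32);

    // Accumulate with an explicit carry chain (G_UADDO feeding G_UADDE).
    uint32_t Src2Lo = (uint32_t)Src2, Src2Hi = (uint32_t)(Src2 >> 32);
    uint32_t SumLo = DstLo + Src2Lo;
    uint32_t CarryLo = SumLo < DstLo;             // G_UADDO carry bit
    uint64_t WideHi = (uint64_t)DstHi + Src2Hi + CarryLo;
    uint32_t SumHi = (uint32_t)WideHi;
    uint32_t CarryOut = (uint32_t)(WideHi >> 32); // bit 64 of the full result

    printf("result = 0x%08x%08x, carry-out = %u\n", SumHi, SumLo, CarryOut);
  }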
+bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst);
assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
@@ -633,9 +796,10 @@ void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
B.buildMergeLikeInstr(Dst, {Lo, Hi});
MI.eraseFromParent();
+ return true;
}
-void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
+bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
int Amt = MI.getOperand(2).getImm();
Register Lo, Hi;
@@ -660,9 +824,10 @@ void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
MI.eraseFromParent();
+ return true;
}
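lowerSplitTo32SExtInReg above splits a 64-bit G_SEXT_INREG into 32-bit pieces:
when the sign bit lives in the low half (Amt <= 32), the high half is simply
the low result shifted right arithmetically by 31; otherwise the low half is
kept as-is and only the high half is sign-extended in place. A standalone
sketch (plain C++; the test values are illustrative):

  #include <cstdint>
  #include <cstdio>

  // Sign-extend V from its low Amt bits (assumes 0 < Amt <= 32).
  static int32_t sextInReg32(int32_t V, unsigned Amt) {
    if (Amt == 32)
      return V;
    unsigned Sh = 32 - Amt;
    return (int32_t)((uint32_t)V << Sh) >> Sh;
  }

  static int64_t sextInReg64(int64_t V, unsigned Amt) {
    int32_t Lo = (int32_t)V, Hi = (int32_t)((uint64_t)V >> 32);
    if (Amt <= 32) {
      Lo = sextInReg32(Lo, Amt);
      Hi = Lo >> 31; // replicate the sign bit across the whole high half
    } else {
      Hi = sextInReg32(Hi, Amt - 32); // low half already holds its bits
    }
    return (int64_t)(((uint64_t)(uint32_t)Hi << 32) | (uint32_t)Lo);
  }

  int main() {
    printf("%lld\n", (long long)sextInReg64(0x00000000000000FFLL, 8));  // -1
    printf("%lld\n", (long long)sextInReg64(0x00000000FFFFFFFFLL, 33)); // 4294967295
  }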
-void RegBankLegalizeHelper::lower(MachineInstr &MI,
+bool RegBankLegalizeHelper::lower(MachineInstr &MI,
const RegBankLLTMapping &Mapping,
SmallSet<Register, 4> &WaterfallSgprs) {
@@ -682,12 +847,14 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
False);
MI.eraseFromParent();
- return;
+ return true;
}
case UnpackBitShift:
return lowerUnpackBitShift(MI);
case UnpackMinMax:
return lowerUnpackMinMax(MI);
+ case ScalarizeToS16:
+ return lowerSplitTo16(MI);
case Ext32To64: {
const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
MachineInstrBuilder Hi;
@@ -707,20 +874,23 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
break;
}
default:
- llvm_unreachable("Unsuported Opcode in Ext32To64");
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
+ MI);
+ return false;
}
B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
{MI.getOperand(1).getReg(), Hi});
MI.eraseFromParent();
- return;
+ return true;
}
case UniCstExt: {
uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
MI.eraseFromParent();
- return;
+ return true;
}
case VgprToVccCopy: {
Register Src = MI.getOperand(1).getReg();
@@ -744,14 +914,37 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
auto Zero = B.buildConstant({VgprRB, Ty}, 0);
B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
MI.eraseFromParent();
- return;
+ return true;
}
case V_BFE:
return lowerV_BFE(MI);
case S_BFE:
return lowerS_BFE(MI);
+ case UniMAD64:
+ return lowerUniMAD64(MI);
+ case UniMul64: {
+ B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
+ MI.eraseFromParent();
+ return true;
+ }
+ case DivSMulToMAD: {
+ auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
+ auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
+ auto Zero = B.buildConstant({VgprRB, S64}, 0);
+
+ unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
+ ? AMDGPU::G_AMDGPU_MAD_U64_U32
+ : AMDGPU::G_AMDGPU_MAD_I64_I32;
+
+ B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
+ {Op1, Op2, Zero});
+ MI.eraseFromParent();
+ return true;
+ }
case SplitTo32:
return lowerSplitTo32(MI);
+ case SplitTo32Mul:
+ return lowerSplitTo32Mul(MI);
case SplitTo32Select:
return lowerSplitTo32Select(MI);
case SplitTo32SExtInReg:
@@ -773,8 +966,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
else if (Size / 128 == 4)
splitLoad(MI, {B128, B128, B128, B128});
else {
- LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
- llvm_unreachable("SplitLoad type not supported for MI");
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
+ MI);
+ return false;
}
}
// 64 and 32 bit load
@@ -785,10 +980,12 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
else if (DstTy == V6S16)
splitLoad(MI, {V4S16, V2S16}, V2S16);
else {
- LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
- llvm_unreachable("SplitLoad type not supported for MI");
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
+ MI);
+ return false;
}
- break;
+ return true;
}
case WidenLoad: {
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
@@ -799,19 +996,74 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
else if (DstTy == V6S16)
widenLoad(MI, V8S16, V2S16);
else {
- LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
- llvm_unreachable("WidenLoad type not supported for MI");
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
+ MI);
+ return false;
}
- break;
+ return true;
}
+ case UnpackAExt:
+ return lowerUnpackAExt(MI);
case WidenMMOToS32:
return widenMMOToS32(cast<GAnyLoad>(MI));
+ case VerifyAllSgpr: {
+ assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
+ return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
+ }));
+ return true;
+ }
+ case ApplyAllVgpr: {
+ assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
+ return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
+ }));
+ B.setInstrAndDebugLoc(MI);
+ for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
+ Register Reg = MI.getOperand(i).getReg();
+ if (MRI.getRegBank(Reg) != VgprRB) {
+ auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
+ MI.getOperand(i).setReg(Copy.getReg(0));
+ }
+ }
+ return true;
+ }
+ case UnmergeToShiftTrunc: {
+    GUnmerge *Unmerge = cast<GUnmerge>(&MI);
+ LLT Ty = MRI.getType(Unmerge->getSourceReg());
+ if (Ty.getSizeInBits() % 32 != 0) {
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: unmerge not multiple of 32",
+ MI);
+ return false;
+ }
+
+ B.setInstrAndDebugLoc(MI);
+ if (Ty.getSizeInBits() > 32) {
+ auto UnmergeV2S16 =
+ B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg());
+ for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
+ auto [Dst0S32, Dst1S32] =
+ unpackAExt(UnmergeV2S16->getOperand(i).getReg());
+ B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
+ B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
+ }
+ } else {
+ auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
+ B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
+ B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
+ }
+
+ MI.eraseFromParent();
+ return true;
+ }
}
if (!WaterfallSgprs.empty()) {
MachineBasicBlock::iterator I = MI.getIterator();
- executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs);
+ if (!executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs))
+ return false;
}
+ return true;
}
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
@@ -832,20 +1084,26 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case Sgpr32ZExt:
case UniInVgprS32:
case Vgpr32:
+ case Vgpr32AExt:
case Vgpr32SExt:
case Vgpr32ZExt:
return LLT::scalar(32);
case Sgpr64:
case Vgpr64:
+ case UniInVgprS64:
return LLT::scalar(64);
case Sgpr128:
case Vgpr128:
return LLT::scalar(128);
+ case SgprP0:
case VgprP0:
return LLT::pointer(0, 64);
case SgprP1:
case VgprP1:
return LLT::pointer(1, 64);
+ case SgprP2:
+ case VgprP2:
+ return LLT::pointer(2, 32);
case SgprP3:
case VgprP3:
return LLT::pointer(3, 32);
@@ -855,18 +1113,26 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case SgprP5:
case VgprP5:
return LLT::pointer(5, 32);
+ case SgprP8:
+ return LLT::pointer(8, 128);
case SgprV2S16:
case VgprV2S16:
case UniInVgprV2S16:
return LLT::fixed_vector(2, 16);
case SgprV2S32:
case VgprV2S32:
+ case UniInVgprV2S32:
return LLT::fixed_vector(2, 32);
+ case VgprV3S32:
+ return LLT::fixed_vector(3, 32);
case SgprV4S32:
case SgprV4S32_WF:
case VgprV4S32:
case UniInVgprV4S32:
return LLT::fixed_vector(4, 32);
+ case VgprV2S64:
+ case UniInVgprV2S64:
+ return LLT::fixed_vector(2, 64);
default:
return LLT();
}
@@ -908,7 +1174,13 @@ LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
case VgprB128:
case UniInVgprB128:
if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
- Ty == LLT::fixed_vector(2, 64) || isAnyPtr(Ty, 128))
+ Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) ||
+ isAnyPtr(Ty, 128))
+ return Ty;
+ return LLT();
+ case VgprB160:
+ case UniInVgprB160:
+ if (Ty.getSizeInBits() == 160)
return Ty;
return LLT();
case SgprB256:
@@ -925,6 +1197,21 @@ LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
Ty == LLT::fixed_vector(8, 64))
return Ty;
return LLT();
+ case SgprBRC: {
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
+ unsigned LLTSize = Ty.getSizeInBits();
+ if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize))
+ return Ty;
+ return LLT();
+ }
+ case VgprBRC: {
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
+    // Check if there is a VGPR register class of the same size as the LLT.
+    if (TRI->getVGPRClassForBitWidth(Ty.getSizeInBits()))
+ return Ty;
+ return LLT();
+ }
default:
return LLT();
}
@@ -940,10 +1227,13 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case Sgpr32_WF:
case Sgpr64:
case Sgpr128:
+ case SgprP0:
case SgprP1:
+ case SgprP2:
case SgprP3:
case SgprP4:
case SgprP5:
+ case SgprP8:
case SgprPtr32:
case SgprPtr64:
case SgprPtr128:
@@ -957,15 +1247,20 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case SgprB128:
case SgprB256:
case SgprB512:
+ case SgprBRC:
case UniInVcc:
case UniInVgprS16:
case UniInVgprS32:
+ case UniInVgprS64:
case UniInVgprV2S16:
+ case UniInVgprV2S32:
case UniInVgprV4S32:
+ case UniInVgprV2S64:
case UniInVgprB32:
case UniInVgprB64:
case UniInVgprB96:
case UniInVgprB128:
+ case UniInVgprB160:
case UniInVgprB256:
case UniInVgprB512:
case Sgpr32Trunc:
@@ -980,6 +1275,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case Vgpr128:
case VgprP0:
case VgprP1:
+ case VgprP2:
case VgprP3:
case VgprP4:
case VgprP5:
@@ -988,13 +1284,18 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case VgprPtr128:
case VgprV2S16:
case VgprV2S32:
+ case VgprV2S64:
+ case VgprV3S32:
case VgprV4S32:
case VgprB32:
case VgprB64:
case VgprB96:
case VgprB128:
+ case VgprB160:
case VgprB256:
case VgprB512:
+ case VgprBRC:
+ case Vgpr32AExt:
case Vgpr32SExt:
case Vgpr32ZExt:
return VgprRB;
@@ -1003,7 +1304,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
}
}
-void RegBankLegalizeHelper::applyMappingDst(
+bool RegBankLegalizeHelper::applyMappingDst(
MachineInstr &MI, unsigned &OpIdx,
const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
// Defs start from operand 0
@@ -1022,10 +1323,12 @@ void RegBankLegalizeHelper::applyMappingDst(
case Sgpr32:
case Sgpr64:
case Sgpr128:
+ case SgprP0:
case SgprP1:
case SgprP3:
case SgprP4:
case SgprP5:
+ case SgprP8:
case SgprV2S16:
case SgprV2S32:
case SgprV4S32:
@@ -1035,11 +1338,14 @@ void RegBankLegalizeHelper::applyMappingDst(
case Vgpr128:
case VgprP0:
case VgprP1:
+ case VgprP2:
case VgprP3:
case VgprP4:
case VgprP5:
case VgprV2S16:
case VgprV2S32:
+ case VgprV2S64:
+ case VgprV3S32:
case VgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
@@ -1052,6 +1358,7 @@ void RegBankLegalizeHelper::applyMappingDst(
case SgprB128:
case SgprB256:
case SgprB512:
+ case SgprBRC:
case SgprPtr32:
case SgprPtr64:
case SgprPtr128:
@@ -1059,8 +1366,10 @@ void RegBankLegalizeHelper::applyMappingDst(
case VgprB64:
case VgprB96:
case VgprB128:
+ case VgprB160:
case VgprB256:
case VgprB512:
+ case VgprBRC:
case VgprPtr32:
case VgprPtr64:
case VgprPtr128: {
@@ -1074,9 +1383,11 @@ void RegBankLegalizeHelper::applyMappingDst(
assert(RB == SgprRB);
Register NewDst = MRI.createVirtualRegister(VccRB_S1);
Op.setReg(NewDst);
- auto CopyS32_Vcc =
- B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
- B.buildTrunc(Reg, CopyS32_Vcc);
+ if (!MRI.use_empty(Reg)) {
+ auto CopyS32_Vcc =
+ B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
+ B.buildTrunc(Reg, CopyS32_Vcc);
+ }
break;
}
case UniInVgprS16: {
@@ -1092,8 +1403,11 @@ void RegBankLegalizeHelper::applyMappingDst(
break;
}
case UniInVgprS32:
+ case UniInVgprS64:
case UniInVgprV2S16:
- case UniInVgprV4S32: {
+ case UniInVgprV2S32:
+ case UniInVgprV4S32:
+ case UniInVgprV2S64: {
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
assert(RB == SgprRB);
Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
@@ -1105,6 +1419,7 @@ void RegBankLegalizeHelper::applyMappingDst(
case UniInVgprB64:
case UniInVgprB96:
case UniInVgprB128:
+ case UniInVgprB160:
case UniInVgprB256:
case UniInVgprB512: {
assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
@@ -1120,20 +1435,28 @@ void RegBankLegalizeHelper::applyMappingDst(
assert(RB == SgprRB);
Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
Op.setReg(NewDst);
- B.buildTrunc(Reg, NewDst);
+ if (!MRI.use_empty(Reg))
+ B.buildTrunc(Reg, NewDst);
break;
}
case InvalidMapping: {
- LLVM_DEBUG(dbgs() << "Instruction with Invalid mapping: "; MI.dump(););
- llvm_unreachable("missing fast rule for MI");
+ reportGISelFailure(
+ MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
+ return false;
}
default:
- llvm_unreachable("ID not supported");
+ reportGISelFailure(
+ MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
+ return false;
}
}
+
+ return true;
}
-void RegBankLegalizeHelper::applyMappingSrc(
+bool RegBankLegalizeHelper::applyMappingSrc(
MachineInstr &MI, unsigned &OpIdx,
const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
SmallSet<Register, 4> &SgprWaterfallOperandRegs) {
@@ -1163,10 +1486,12 @@ void RegBankLegalizeHelper::applyMappingSrc(
case Sgpr32:
case Sgpr64:
case Sgpr128:
+ case SgprP0:
case SgprP1:
case SgprP3:
case SgprP4:
case SgprP5:
+ case SgprP8:
case SgprV2S16:
case SgprV2S32:
case SgprV4S32: {
@@ -1181,6 +1506,7 @@ void RegBankLegalizeHelper::applyMappingSrc(
case SgprB128:
case SgprB256:
case SgprB512:
+ case SgprBRC:
case SgprPtr32:
case SgprPtr64:
case SgprPtr128: {
@@ -1195,11 +1521,14 @@ void RegBankLegalizeHelper::applyMappingSrc(
case Vgpr128:
case VgprP0:
case VgprP1:
+ case VgprP2:
case VgprP3:
case VgprP4:
case VgprP5:
case VgprV2S16:
case VgprV2S32:
+ case VgprV2S64:
+ case VgprV3S32:
case VgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[i]));
if (RB != VgprRB) {
@@ -1213,8 +1542,10 @@ void RegBankLegalizeHelper::applyMappingSrc(
case VgprB64:
case VgprB96:
case VgprB128:
+ case VgprB160:
case VgprB256:
case VgprB512:
+ case VgprBRC:
case VgprPtr32:
case VgprPtr64:
case VgprPtr128: {
@@ -1268,6 +1599,13 @@ void RegBankLegalizeHelper::applyMappingSrc(
Op.setReg(Zext.getReg(0));
break;
}
+ case Vgpr32AExt: {
+ assert(Ty.getSizeInBits() < 32);
+ assert(RB == VgprRB);
+ auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
+ Op.setReg(Aext.getReg(0));
+ break;
+ }
case Vgpr32SExt: {
// Note this ext allows S1, and it is meant to be combined away.
assert(Ty.getSizeInBits() < 32);
@@ -1285,12 +1623,16 @@ void RegBankLegalizeHelper::applyMappingSrc(
break;
}
default:
- llvm_unreachable("ID not supported");
+ reportGISelFailure(
+ MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
+ return false;
}
}
+ return true;
}
-void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
+bool RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(Dst);
@@ -1313,16 +1655,17 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
MI.getOperand(i).setReg(NewUse.getReg(0));
}
- return;
+ return true;
}
- // ALL divergent i1 phis should be already lowered and inst-selected into PHI
- // with sgpr reg class and S1 LLT.
+  // ALL divergent i1 phis should have been lowered and inst-selected into PHIs
+  // with an sgpr reg class and S1 LLT by the
+  // AMDGPUGlobalISelDivergenceLowering pass.
// Note: this includes divergent phis that don't require lowering.
if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) {
- LLVM_DEBUG(dbgs() << "Divergent S1 G_PHI: "; MI.dump(););
- llvm_unreachable("Make sure to run AMDGPUGlobalISelDivergenceLowering "
- "before RegBankLegalize to lower lane mask(vcc) phis");
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: Can't lower divergent S1 G_PHI",
+ MI);
+ return false;
}
// We accept all types that can fit in some register class.
@@ -1330,11 +1673,13 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
// Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
Ty == LLT::pointer(4, 64)) {
- return;
+ return true;
}
- LLVM_DEBUG(dbgs() << "G_PHI not handled: "; MI.dump(););
- llvm_unreachable("type not supported");
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: type not supported for G_PHI",
+ MI);
+ return false;
}
[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index d937815..86669ae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -12,6 +12,7 @@
#include "AMDGPURegBankLegalizeRules.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
namespace llvm {
@@ -27,11 +28,13 @@ namespace AMDGPU {
// to replace instruction. In other case InstApplyMethod will create new
// instruction(s).
class RegBankLegalizeHelper {
+ MachineFunction &MF;
const GCNSubtarget &ST;
MachineIRBuilder &B;
MachineRegisterInfo &MRI;
const MachineUniformityInfo &MUI;
const RegisterBankInfo &RBI;
+ MachineOptimizationRemarkEmitter MORE;
const RegBankLegalizeRules &RBLRules;
const bool IsWave32;
const RegisterBank *SgprRB;
@@ -72,6 +75,7 @@ class RegBankLegalizeHelper {
static constexpr LLT P6 = LLT::pointer(6, 32);
MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32};
+ MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16};
MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32};
MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1};
@@ -80,10 +84,10 @@ public:
const RegisterBankInfo &RBI,
const RegBankLegalizeRules &RBLRules);
- void findRuleAndApplyMapping(MachineInstr &MI);
+ bool findRuleAndApplyMapping(MachineInstr &MI);
// Manual apply helpers.
- void applyMappingPHI(MachineInstr &MI);
+ bool applyMappingPHI(MachineInstr &MI);
void applyMappingTrivial(MachineInstr &MI);
private:
@@ -96,34 +100,39 @@ private:
const RegisterBank *getRegBankFromID(RegBankLLTMappingApplyID ID);
- void
+ bool
applyMappingDst(MachineInstr &MI, unsigned &OpIdx,
const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs);
- void
+ bool
applyMappingSrc(MachineInstr &MI, unsigned &OpIdx,
const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
SmallSet<Register, 4> &SgprWaterfallOperandRegs);
- void splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown,
+ bool splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown,
LLT MergeTy = LLT());
- void widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy = LLT());
- void widenMMOToS32(GAnyLoad &MI) const;
+ bool widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy = LLT());
+ bool widenMMOToS32(GAnyLoad &MI) const;
- void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping,
+ bool lower(MachineInstr &MI, const RegBankLLTMapping &Mapping,
SmallSet<Register, 4> &SgprWaterfallOperandRegs);
- void lowerVccExtToSel(MachineInstr &MI);
+ bool lowerVccExtToSel(MachineInstr &MI);
std::pair<Register, Register> unpackZExt(Register Reg);
std::pair<Register, Register> unpackSExt(Register Reg);
std::pair<Register, Register> unpackAExt(Register Reg);
- void lowerUnpackBitShift(MachineInstr &MI);
- void lowerV_BFE(MachineInstr &MI);
- void lowerS_BFE(MachineInstr &MI);
- void lowerSplitTo32(MachineInstr &MI);
- void lowerSplitTo32Select(MachineInstr &MI);
- void lowerSplitTo32SExtInReg(MachineInstr &MI);
- void lowerUnpackMinMax(MachineInstr &MI);
+ std::pair<Register, Register> unpackAExtTruncS16(Register Reg);
+ bool lowerUnpackBitShift(MachineInstr &MI);
+ bool lowerV_BFE(MachineInstr &MI);
+ bool lowerS_BFE(MachineInstr &MI);
+ bool lowerUniMAD64(MachineInstr &MI);
+ bool lowerSplitTo32(MachineInstr &MI);
+ bool lowerSplitTo32Mul(MachineInstr &MI);
+ bool lowerSplitTo16(MachineInstr &MI);
+ bool lowerSplitTo32Select(MachineInstr &MI);
+ bool lowerSplitTo32SExtInReg(MachineInstr &MI);
+ bool lowerUnpackMinMax(MachineInstr &MI);
+ bool lowerUnpackAExt(MachineInstr &MI);
};
} // end namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index a67b12a..a0be07d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -60,20 +60,28 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::pointer(0, 64);
case P1:
return MRI.getType(Reg) == LLT::pointer(1, 64);
+ case P2:
+ return MRI.getType(Reg) == LLT::pointer(2, 32);
case P3:
return MRI.getType(Reg) == LLT::pointer(3, 32);
case P4:
return MRI.getType(Reg) == LLT::pointer(4, 64);
case P5:
return MRI.getType(Reg) == LLT::pointer(5, 32);
+ case P8:
+ return MRI.getType(Reg) == LLT::pointer(8, 128);
case Ptr32:
return isAnyPtr(MRI.getType(Reg), 32);
case Ptr64:
return isAnyPtr(MRI.getType(Reg), 64);
case Ptr128:
return isAnyPtr(MRI.getType(Reg), 128);
+ case V2S16:
+ return MRI.getType(Reg) == LLT::fixed_vector(2, 16);
case V2S32:
return MRI.getType(Reg) == LLT::fixed_vector(2, 32);
+ case V3S32:
+ return MRI.getType(Reg) == LLT::fixed_vector(3, 32);
case V4S32:
return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
case B32:
@@ -84,6 +92,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg).getSizeInBits() == 96;
case B128:
return MRI.getType(Reg).getSizeInBits() == 128;
+ case B160:
+ return MRI.getType(Reg).getSizeInBits() == 160;
case B256:
return MRI.getType(Reg).getSizeInBits() == 256;
case B512:
@@ -102,12 +112,16 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniform(Reg);
case UniP1:
return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniform(Reg);
+ case UniP2:
+ return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isUniform(Reg);
case UniP3:
return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isUniform(Reg);
case UniP4:
return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg);
case UniP5:
return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg);
+ case UniP8:
+ return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniform(Reg);
case UniPtr32:
return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniform(Reg);
case UniPtr64:
@@ -116,6 +130,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniform(Reg);
case UniV2S16:
return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg);
+ case UniV2S32:
+ return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isUniform(Reg);
case UniB32:
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg);
case UniB64:
@@ -124,10 +140,23 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isUniform(Reg);
case UniB128:
return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isUniform(Reg);
+ case UniB160:
+ return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isUniform(Reg);
case UniB256:
return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniform(Reg);
case UniB512:
return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniform(Reg);
+ case UniBRC: {
+ if (!MUI.isUniform(Reg))
+ return false;
+    // Check if there is an SGPR register class of the same size as the LLT.
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
+    // There is no 16-bit SGPR register class; the extra size check is needed
+    // because getSGPRClassForBitWidth returns SReg_32RegClass for size 16.
+ unsigned LLTSize = MRI.getType(Reg).getSizeInBits();
+ return LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize);
+ }
case DivS1:
return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg);
case DivS16:
@@ -142,6 +171,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergent(Reg);
case DivP1:
return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg);
+ case DivP2:
+ return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isDivergent(Reg);
case DivP3:
return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isDivergent(Reg);
case DivP4:
@@ -156,6 +187,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergent(Reg);
case DivV2S16:
return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg);
+ case DivV2S32:
+ return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isDivergent(Reg);
case DivB32:
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg);
case DivB64:
@@ -164,10 +197,20 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isDivergent(Reg);
case DivB128:
return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isDivergent(Reg);
+ case DivB160:
+ return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isDivergent(Reg);
case DivB256:
return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergent(Reg);
case DivB512:
return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergent(Reg);
+ case DivBRC: {
+ if (!MUI.isDivergent(Reg))
+ return false;
+    // Check if there is a VGPR register class of the same size as the LLT.
+    const SIRegisterInfo *TRI =
+        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
+    return TRI->getVGPRClassForBitWidth(MRI.getType(Reg).getSizeInBits());
+ }
case _:
return true;
default:
@@ -202,7 +245,7 @@ bool PredicateMapping::match(const MachineInstr &MI,
return true;
}
-SetOfRulesForOpcode::SetOfRulesForOpcode() {}
+SetOfRulesForOpcode::SetOfRulesForOpcode() = default;
SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes)
: FastTypes(FastTypes) {}
@@ -234,12 +277,13 @@ UniformityLLTOpPredicateID LLTToBId(LLT Ty) {
return B64;
if (Ty == LLT::fixed_vector(3, 32))
return B96;
- if (Ty == LLT::fixed_vector(4, 32) || isAnyPtr(Ty, 128))
+ if (Ty == LLT::fixed_vector(4, 32) || Ty == LLT::fixed_vector(2, 64) ||
+ Ty == LLT::fixed_vector(8, 16) || isAnyPtr(Ty, 128))
return B128;
return _;
}
-const RegBankLLTMapping &
+const RegBankLLTMapping *
SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI,
const MachineRegisterInfo &MRI,
const MachineUniformityInfo &MUI) const {
@@ -256,17 +300,16 @@ SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI,
Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg)));
if (Slot != -1)
- return MUI.isUniform(Reg) ? Uni[Slot] : Div[Slot];
+ return MUI.isUniform(Reg) ? &Uni[Slot] : &Div[Slot];
}
// Slow search for more complex rules.
for (const RegBankLegalizeRule &Rule : Rules) {
if (Rule.Predicate.match(MI, MUI, MRI))
- return Rule.OperandMapping;
+ return &Rule.OperandMapping;
}
- LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
- llvm_unreachable("None of the rules defined for MI's opcode matched MI");
+ return nullptr;
}
void SetOfRulesForOpcode::addRule(RegBankLegalizeRule Rule) {
@@ -277,14 +320,14 @@ void SetOfRulesForOpcode::addFastRuleDivergent(UniformityLLTOpPredicateID Ty,
RegBankLLTMapping RuleApplyIDs) {
int Slot = getFastPredicateSlot(Ty);
assert(Slot != -1 && "Ty unsupported in this FastRulesTypes");
- Div[Slot] = RuleApplyIDs;
+ Div[Slot] = std::move(RuleApplyIDs);
}
void SetOfRulesForOpcode::addFastRuleUniform(UniformityLLTOpPredicateID Ty,
RegBankLLTMapping RuleApplyIDs) {
int Slot = getFastPredicateSlot(Ty);
assert(Slot != -1 && "Ty unsupported in this FastRulesTypes");
- Uni[Slot] = RuleApplyIDs;
+ Uni[Slot] = std::move(RuleApplyIDs);
}
int SetOfRulesForOpcode::getFastPredicateSlot(
@@ -349,7 +392,7 @@ RegBankLegalizeRules::addRulesForIOpcs(std::initializer_list<unsigned> OpcList,
return RuleSetInitializer(OpcList, IRulesAlias, IRules, FastTypes);
}
-const SetOfRulesForOpcode &
+const SetOfRulesForOpcode *
RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
if (Opc == AMDGPU::G_INTRINSIC || Opc == AMDGPU::G_INTRINSIC_CONVERGENT ||
@@ -357,19 +400,15 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const {
Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) {
unsigned IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
auto IRAIt = IRulesAlias.find(IntrID);
- if (IRAIt == IRulesAlias.end()) {
- LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
- llvm_unreachable("No rules defined for intrinsic opcode");
- }
- return IRules.at(IRAIt->second);
+ if (IRAIt == IRulesAlias.end())
+ return nullptr;
+ return &IRules.at(IRAIt->second);
}
auto GRAIt = GRulesAlias.find(Opc);
- if (GRAIt == GRulesAlias.end()) {
- LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
- llvm_unreachable("No rules defined for generic opcode");
- }
- return GRules.at(GRAIt->second);
+ if (GRAIt == GRulesAlias.end())
+ return nullptr;
+ return &GRules.at(GRAIt->second);
}
// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'.
@@ -470,9 +509,54 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
.Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
- .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackAExt})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});
+
+ addRulesForGOpcs({G_UADDO, G_USUBO}, Standard)
+ .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}})
+ .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}});
+
+ addRulesForGOpcs({G_UADDE, G_USUBE, G_SADDE, G_SSUBE}, Standard)
+ .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}})
+ .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}});
+
+ addRulesForGOpcs({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT}, Standard)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
+
+ bool HasVecMulU64 = ST->hasVectorMulU64();
+ addRulesForGOpcs({G_MUL}, Standard)
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+ .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(S64, {{SgprB64}, {SgprB64, SgprB64}})
+ .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
+ .Div(S64, {{VgprB64}, {VgprB64, VgprB64}}, HasVecMulU64)
+ .Div(S64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32Mul}, !HasVecMulU64);
+
+ bool hasMulHi = ST->hasScalarMulHiInsts();
+ addRulesForGOpcs({G_UMULH, G_SMULH}, Standard)
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasMulHi)
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasMulHi);
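
Editorial note: a pattern worth calling out across these tables is the trailing bool on .Uni/.Div/.Any, which acts as a registration guard so mutually exclusive rules for the same type (here hasMulHi vs. !hasMulHi) can coexist in one chain. A stripped-down sketch of how such a guard can work (illustrative, not the actual RuleSetInitializer):

    #include <vector>

    // Toy rule table: each add is a no-op unless its guard holds, so
    // '.Uni(S32, ..., hasMulHi)' and '.Uni(S32, ..., !hasMulHi)' can both
    // appear in the same chain with only one taking effect per subtarget.
    struct ToyRuleSet {
      std::vector<int> Rules;
      ToyRuleSet &add(int Rule, bool STPred = true) {
        if (STPred)
          Rules.push_back(Rule);
        return *this; // returning *this is what enables the builder chaining
      }
    };
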
+
+ addRulesForGOpcs({G_AMDGPU_MAD_U64_U32}, Standard)
+ .Div(S64, {{Vgpr64, Vcc}, {Vgpr32, Vgpr32, Vgpr64}})
+ .Uni(S64, {{Sgpr64, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr64}, UniMAD64});
- addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
+ bool HasScalarSMulU64 = ST->hasScalarSMulU64();
+ addRulesForGOpcs({G_AMDGPU_S_MUL_U64_U32, G_AMDGPU_S_MUL_I64_I32}, Standard)
+ .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}, UniMul64}, HasScalarSMulU64)
+ .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}, DivSMulToMAD});
addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB)
.Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}})
@@ -514,6 +598,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
+ addRulesForGOpcs({G_FSHR}, Standard)
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
+
addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}});
addRulesForGOpcs({G_UBFX, G_SBFX}, Standard)
@@ -538,21 +626,56 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackMinMax})
.Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
- // Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT
- // and G_FREEZE here, rest is trivially regbankselected earlier
+ // Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT and G_FCONSTANT
+  // here; the rest is trivially regbankselected earlier.
addRulesForGOpcs({G_IMPLICIT_DEF}).Any({{UniS1}, {{Sgpr32Trunc}, {}}});
addRulesForGOpcs({G_CONSTANT})
.Any({{UniS1, _}, {{Sgpr32Trunc}, {None}, UniCstExt}});
- addRulesForGOpcs({G_FREEZE}).Any({{DivS1}, {{Vcc}, {Vcc}}});
- addRulesForGOpcs({G_ICMP})
- .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
- .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
- .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}});
+ addRulesForGOpcs({G_FREEZE})
+ .Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt}}})
+ .Any({{DivS1}, {{Vcc}, {Vcc}}})
+ .Any({{UniS16}, {{Sgpr16}, {Sgpr16}}})
+ .Any({{UniBRC}, {{SgprBRC}, {SgprBRC}}})
+ .Any({{DivBRC}, {{VgprBRC}, {VgprBRC}}});
+
+ addRulesForGOpcs({G_UNMERGE_VALUES})
+ .Any({{UniS16}, {{}, {}, UnmergeToShiftTrunc}})
+ .Any({{UniBRC}, {{}, {}, VerifyAllSgpr}})
+ .Any({{DivBRC}, {{}, {}, ApplyAllVgpr}});
+
+ Predicate isSignedICmp([](const MachineInstr &MI) -> bool {
+ auto Pred =
+ static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+ return CmpInst::isSigned(Pred);
+ });
+
+ Predicate isEqualityICmp([](const MachineInstr &MI) -> bool {
+ auto Pred =
+ static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+ return ICmpInst::isEquality(Pred);
+ });
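
Editorial note: these two lambdas are combined below as '!isEqualityICmp && isSignedICmp', which works because Predicate overloads the logical operators (per the "syntactic sugar" comment earlier in this file). A minimal self-contained sketch of that combinator, with a stub standing in for MachineInstr:

    #include <functional>

    struct MIStub {}; // stand-in for llvm::MachineInstr

    struct Pred {
      std::function<bool(const MIStub &)> Fn;
      // Note: overloaded '&&'/'||' lose short-circuiting, which is fine for
      // cheap rule predicates like these.
      Pred operator&&(const Pred &RHS) const {
        auto L = Fn, R = RHS.Fn;
        return {[L, R](const MIStub &MI) { return L(MI) && R(MI); }};
      }
      Pred operator||(const Pred &RHS) const {
        auto L = Fn, R = RHS.Fn;
        return {[L, R](const MIStub &MI) { return L(MI) || R(MI); }};
      }
      Pred operator!() const {
        auto L = Fn;
        return {[L](const MIStub &MI) { return !L(MI); }};
      }
    };
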
- addRulesForGOpcs({G_FCMP})
- .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}})
- .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}});
+ bool HasScalarCompareEq64 = ST->hasScalarCompareEq64();
+ // clang-format off
+ addRulesForGOpcs({G_ICMP})
+ .Any({{{UniS1, _, S16}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}})
+ .Any({{{UniS1, _, S16}, !isEqualityICmp && isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32SExt, Sgpr32SExt}}})
+ .Any({{{UniS1, _, S16}, !isEqualityICmp && !isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}})
+ .Any({{{DivS1, _, S16}}, {{Vcc}, {None, Vgpr16, Vgpr16}}})
+ .Any({{{UniS1, _, S32}}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
+ .Any({{{DivS1, _, S32}}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
+ .Any({{{UniS1, _, S64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr64, Sgpr64}}}, HasScalarCompareEq64)
+ .Any({{{UniS1, _, S64}, isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}}, !HasScalarCompareEq64)
+ .Any({{{UniS1, _, S64}, !isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}})
+ .Any({{{DivS1, _, S64}}, {{Vcc}, {None, Vgpr64, Vgpr64}}})
+ .Any({{{UniS1, _, Ptr32}}, {{Sgpr32Trunc}, {None, SgprPtr32, SgprPtr32}}})
+ .Any({{{DivS1, _, Ptr32}}, {{Vcc}, {None, VgprPtr32, VgprPtr32}}})
+ .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, SgprPtr64, SgprPtr64}}}, HasScalarCompareEq64)
+ .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}}, !HasScalarCompareEq64)
+ .Any({{{UniS1, _, Ptr64}, !isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}})
+ .Any({{{DivS1, _, Ptr64}}, {{Vcc}, {None, VgprPtr64, VgprPtr64}}});
+ // clang-format on
addRulesForGOpcs({G_BRCOND})
.Any({{UniS1}, {{}, {Sgpr32AExtBoolInReg}}})
@@ -580,6 +703,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
.Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
+ bool Has16bitCmp = ST->has16BitInsts();
+
// In GlobalISel, in-reg G_TRUNC is treated as a no-op, inst-selected into a COPY.
// It is up to the user to deal with the truncated bits.
addRulesForGOpcs({G_TRUNC})
@@ -593,7 +718,9 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}})
.Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
// This is non-trivial. VgprToVccCopy is done using a compare instruction.
- .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}})
+ .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}}, Has16bitCmp)
+ .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr32AExt}, VgprToVccCopy}},
+ !Has16bitCmp)
.Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}})
.Any({{DivS1, DivS64}, {{Vcc}, {Vgpr64}, VgprToVccCopy}});
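
Editorial note: the "compare instruction" mentioned above is easiest to see per lane. The boolean lives in a VGPR's low bit, and the copy is typically lowered as a mask of that bit followed by a not-equal-to-zero V_CMP, which deposits the bit into the lane's VCC position. A per-lane arithmetic model (illustrative only):

    #include <cstdint>

    // Per-lane view of VgprToVccCopy: the lane's VCC bit is simply "the low
    // bit of the lane's VGPR value is set", which (v & 1) != 0 computes.
    static bool vgprToVccLane(uint32_t VgprVal) { return (VgprVal & 1u) != 0u; }
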
@@ -639,6 +766,64 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Uni(S64, {{Sgpr64}, {Sgpr64, Imm}})
.Div(S64, {{Vgpr64}, {Vgpr64, Imm}});
+ // Atomic read-modify-write operations: result and value are always VGPR,
+  // the pointer varies by address space.
+ addRulesForGOpcs({G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_XCHG,
+ G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
+ G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN,
+ G_ATOMICRMW_UMAX, G_ATOMICRMW_UINC_WRAP,
+ G_ATOMICRMW_UDEC_WRAP})
+ .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}})
+ .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}})
+ .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}})
+ .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}})
+ .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}})
+ .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}});
+
+ bool HasAtomicFlatPkAdd16Insts = ST->hasAtomicFlatPkAdd16Insts();
+ bool HasAtomicBufferGlobalPkAddF16Insts =
+ ST->hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
+ ST->hasAtomicBufferGlobalPkAddF16Insts();
+ bool HasAtomicDsPkAdd16Insts = ST->hasAtomicDsPkAdd16Insts();
+ addRulesForGOpcs({G_ATOMICRMW_FADD})
+ .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}})
+ .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}})
+ .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}})
+ .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}})
+ .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}})
+ .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}})
+ .Any({{DivV2S16, P0, V2S16}, {{VgprV2S16}, {VgprP0, VgprV2S16}}},
+ HasAtomicFlatPkAdd16Insts)
+ .Any({{DivV2S16, P1, V2S16}, {{VgprV2S16}, {VgprP1, VgprV2S16}}},
+ HasAtomicBufferGlobalPkAddF16Insts)
+ .Any({{DivV2S16, P3, V2S16}, {{VgprV2S16}, {VgprP3, VgprV2S16}}},
+ HasAtomicDsPkAdd16Insts);
+
+ addRulesForGOpcs({G_ATOMIC_CMPXCHG})
+ .Any({{DivS32, P2}, {{Vgpr32}, {VgprP2, Vgpr32, Vgpr32}}})
+ .Any({{DivS64, P2}, {{Vgpr64}, {VgprP2, Vgpr64, Vgpr64}}})
+ .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3, Vgpr32, Vgpr32}}})
+ .Any({{DivS64, P3}, {{Vgpr64}, {VgprP3, Vgpr64, Vgpr64}}});
+
+ addRulesForGOpcs({G_AMDGPU_ATOMIC_CMPXCHG})
+ .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0, VgprV2S32}}})
+ .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1, VgprV2S32}}})
+ .Any({{DivS64, P0}, {{Vgpr64}, {VgprP0, VgprV2S64}}})
+ .Any({{DivS64, P1}, {{Vgpr64}, {VgprP1, VgprV2S64}}});
+
+ addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_CMPSWAP}, Standard)
+ .Div(S32, {{Vgpr32},
+ {Vgpr32, Vgpr32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(S64, {{Vgpr64},
+ {Vgpr64, Vgpr64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
+
+ addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_SWAP, G_AMDGPU_BUFFER_ATOMIC_UMAX,
+ G_AMDGPU_BUFFER_ATOMIC_UMIN, G_AMDGPU_BUFFER_ATOMIC_SMAX,
+ G_AMDGPU_BUFFER_ATOMIC_SMIN},
+ Standard)
+ .Div(S32, {{Vgpr32}, {Vgpr32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
+
bool hasSMRDx3 = ST->hasScalarDwordx3Loads();
bool hasSMRDSmall = ST->hasScalarSubwordLoads();
bool usesTrue16 = ST->useRealTrue16Insts();
@@ -860,6 +1045,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{B64, Ptr32}, {{}, {VgprB64, VgprPtr32}}})
.Any({{B96, Ptr32}, {{}, {VgprB96, VgprPtr32}}})
.Any({{B128, Ptr32}, {{}, {VgprB128, VgprPtr32}}});
+
// clang-format on
addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD, G_AMDGPU_BUFFER_LOAD_FORMAT,
@@ -874,8 +1060,49 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
.Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
- addRulesForGOpcs({G_AMDGPU_BUFFER_STORE})
- .Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}});
+ addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_USHORT, G_AMDGPU_BUFFER_LOAD_UBYTE,
+ G_AMDGPU_BUFFER_LOAD_SSHORT, G_AMDGPU_BUFFER_LOAD_SBYTE},
+ StandardB)
+ .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
+
+ addRulesForGOpcs(
+ {G_AMDGPU_BUFFER_LOAD_UBYTE_TFE, G_AMDGPU_BUFFER_LOAD_USHORT_TFE},
+ StandardB)
+ .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
+
+ addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_TFE, G_AMDGPU_BUFFER_LOAD_FORMAT_TFE},
+ StandardB)
+ .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(B96, {{VgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B96, {{UniInVgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Any({{DivB160}, {{VgprB160}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
+ .Any({{UniB160},
+ {{UniInVgprB160}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}});
+
+ addRulesForGOpcs(
+ {G_AMDGPU_BUFFER_LOAD_FORMAT_D16, G_AMDGPU_TBUFFER_LOAD_FORMAT_D16},
+ StandardB)
+ .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
+
+ addRulesForGOpcs({G_AMDGPU_BUFFER_STORE, G_AMDGPU_BUFFER_STORE_BYTE,
+ G_AMDGPU_BUFFER_STORE_SHORT, G_AMDGPU_BUFFER_STORE_FORMAT,
+ G_AMDGPU_BUFFER_STORE_FORMAT_D16,
+ G_AMDGPU_TBUFFER_STORE_FORMAT,
+ G_AMDGPU_TBUFFER_STORE_FORMAT_D16})
+ .Any({{B32}, {{}, {VgprB32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
+ .Any({{B64}, {{}, {VgprB64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
+ .Any({{B96}, {{}, {VgprB96, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
+ .Any({{B128}, {{}, {VgprB128, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}});
addRulesForGOpcs({G_PTR_ADD})
.Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32, Sgpr32}}})
@@ -899,34 +1126,237 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniS128}, {{Sgpr128}, {SgprPtr128}}})
.Any({{DivS128}, {{Vgpr128}, {VgprPtr128}}});
+ // FIXME: Update llvm/test/CodeGen/AMDGPU/ptrmask.ll to use GlobalISel.
+  // Currently crashes on P8 (buffer resource) tests due to a legalizer issue.
+ addRulesForGOpcs({G_PTRMASK})
+ .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
+ .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
+ .Any({{UniP3}, {{SgprP3}, {SgprP3, Sgpr32}}})
+ .Any({{DivP3}, {{VgprP3}, {VgprP3, Vgpr32}}});
+
addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
- addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}});
+ addRulesForGOpcs({G_BITREVERSE}, Standard)
+ .Uni(S32, {{Sgpr32}, {Sgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32}})
+ .Uni(S64, {{Sgpr64}, {Sgpr64}})
+ .Div(S64, {{Vgpr64}, {Vgpr64}});
+
+ addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}});
+
+ addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard)
+ .Uni(S64, {{Sgpr64}, {}});
+
+ addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}});
+
+ addRulesForGOpcs({G_GLOBAL_VALUE})
+ .Any({{UniP0}, {{SgprP0}, {}}})
+ .Any({{UniP1}, {{SgprP1}, {}}})
+ .Any({{UniP3}, {{SgprP3}, {}}})
+ .Any({{UniP4}, {{SgprP4}, {}}})
+ .Any({{UniP8}, {{SgprP8}, {}}});
+
+ addRulesForGOpcs({G_AMDGPU_WAVE_ADDRESS}).Any({{UniP5}, {{SgprP5}, {}}});
bool hasSALUFloat = ST->hasSALUFloatInsts();
- addRulesForGOpcs({G_FADD}, Standard)
+ addRulesForGOpcs({G_FADD, G_FMUL, G_STRICT_FADD, G_STRICT_FMUL}, Standard)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
+ .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
.Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
- .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
+ .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat)
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16},
+ hasSALUFloat)
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
- addRulesForGOpcs({G_FPTOUI})
- .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
- .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);
+ addRulesForGOpcs({G_FSUB, G_STRICT_FSUB}, Standard)
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
+ .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat);
- addRulesForGOpcs({G_UITOFP})
+ addRulesForGOpcs({G_FMAD}, Standard)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
+
+ addRulesForGOpcs({G_FLDEXP, G_STRICT_FLDEXP}, Standard)
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+ .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr32}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
+
+ addRulesForGOpcs({G_FMA, G_STRICT_FMA}, Standard)
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
+ .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64, Vgpr64}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64, Vgpr64}})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16, VgprV2S16}})
+ .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32, VgprV2S32}}})
+ .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32, VgprV2S32}}})
+ .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16, Sgpr16}}, hasSALUFloat)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}}, !hasSALUFloat)
+ .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}}, hasSALUFloat)
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}, !hasSALUFloat)
+ .Uni(V2S16,
+ {{SgprV2S16}, {SgprV2S16, SgprV2S16, SgprV2S16}, ScalarizeToS16},
+ hasSALUFloat)
+ .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16, VgprV2S16}},
+ !hasSALUFloat);
+
+ addRulesForGOpcs({G_AMDGPU_FMED3}, Standard)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
+
+ // TODO: This opcode is generated from the i64->i16 signed clamped pattern in
+ // the PreLegalizerCombiner. Move the combine to RegBankCombiner to keep more
+ // instructions on SALU.
+ addRulesForGOpcs({G_AMDGPU_SMED3}, Standard)
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
+
+ // FNEG and FABS are either folded as source modifiers or can be selected as
+  // bitwise XOR and AND with a mask. XOR and AND are available on SALU, but
+  // for targets without SALU float we still select them on VGPR since there
+  // would be no real SGPR use.
+ addRulesForGOpcs({G_FNEG, G_FABS}, Standard)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasSALUFloat)
+ .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat)
+ .Div(S16, {{Vgpr16}, {Vgpr16}})
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32}}, !hasSALUFloat)
+ .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat)
+ .Div(S32, {{Vgpr32}, {Vgpr32}})
+ .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
+ .Div(S64, {{Vgpr64}, {Vgpr64}})
+ .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}, !hasSALUFloat)
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, ScalarizeToS16}, hasSALUFloat)
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
+ .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
+ .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}});
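
Editorial note: the FNEG/FABS comment above is concrete at the bit level. For IEEE floats, fneg flips the sign bit and fabs clears it, so each reduces to a single 32-bit logical op regardless of which ALU executes it (S_XOR_B32/S_AND_B32 on SALU, V_XOR_B32/V_AND_B32 on VALU). A standalone illustration (C++20 for std::bit_cast; not code from this patch):

    #include <bit>
    #include <cstdint>

    // fneg: XOR the sign bit away from its current value.
    float fnegBits(float X) {
      return std::bit_cast<float>(std::bit_cast<uint32_t>(X) ^ 0x80000000u);
    }
    // fabs: AND the sign bit away entirely.
    float fabsBits(float X) {
      return std::bit_cast<float>(std::bit_cast<uint32_t>(X) & 0x7fffffffu);
    }
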
+
+ addRulesForGOpcs({G_FCANONICALIZE}, Standard)
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32}})
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
+ .Div(S16, {{Vgpr16}, {Vgpr16}})
+ .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
+ .Div(S64, {{Vgpr64}, {Vgpr64}})
+ .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
+ .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
+ .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}});
+
+ bool hasPST = ST->hasPseudoScalarTrans();
+ addRulesForGOpcs({G_FSQRT}, Standard)
+ .Div(S16, {{Vgpr16}, {Vgpr16}})
+ .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasPST)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasPST);
+
+ addRulesForGOpcs({G_FPTOUI, G_FPTOSI})
+ .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}})
+ .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}})
+ .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
+ .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat)
+ .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
+ .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
+ .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat)
.Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
+ .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
+ .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}});
+
+ addRulesForGOpcs({G_UITOFP, G_SITOFP})
+ .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}})
+ .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}})
+ .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
+ .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat)
+ .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
- .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);
+ .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat)
+ .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
+ .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}})
+ .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}});
+
+ addRulesForGOpcs({G_FPEXT})
+ .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
+ .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}})
+ .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}})
+ .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
+ .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat);
+
+ addRulesForGOpcs({G_AMDGPU_CVT_PK_I16_I32}, Standard)
+ .Uni(V2S16, {{UniInVgprV2S16}, {Vgpr32, Vgpr32}})
+ .Div(V2S16, {{VgprV2S16}, {Vgpr32, Vgpr32}});
+
+ addRulesForGOpcs({G_AMDGPU_FMIN_LEGACY, G_AMDGPU_FMAX_LEGACY}, Standard)
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
+
+ addRulesForGOpcs({G_FPTRUNC})
+ .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
+ .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
+ .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
+ .Any({{UniV2S16, V2S32}, {{UniInVgprV2S16}, {VgprV2S32}}})
+ .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
+ .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
+ .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat);
+
+ addRulesForGOpcs({G_IS_FPCLASS})
+ .Any({{DivS1, S16}, {{Vcc}, {Vgpr16}}})
+ .Any({{UniS1, S16}, {{UniInVcc}, {Vgpr16}}})
+ .Any({{DivS1, S32}, {{Vcc}, {Vgpr32}}})
+ .Any({{UniS1, S32}, {{UniInVcc}, {Vgpr32}}})
+ .Any({{DivS1, S64}, {{Vcc}, {Vgpr64}}})
+ .Any({{UniS1, S64}, {{UniInVcc}, {Vgpr64}}});
+
+ addRulesForGOpcs({G_FCMP}, Standard)
+ .Any({{UniS1, _, S16}, {{Sgpr32Trunc}, {None, Sgpr16, Sgpr16}}},
+ hasSALUFloat)
+ .Any({{UniS1, _, S16}, {{UniInVcc}, {None, Vgpr16, Vgpr16}}},
+ !hasSALUFloat)
+ .Any({{DivS1, _, S16}, {{Vcc}, {None, Vgpr16, Vgpr16}}})
+ .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}},
+ hasSALUFloat)
+ .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}},
+ !hasSALUFloat)
+ .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
+ .Any({{UniS1, _, S64}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}})
+ .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}});
+
+ addRulesForGOpcs({G_INTRINSIC_TRUNC, G_INTRINSIC_ROUNDEVEN, G_FFLOOR, G_FCEIL,
+ G_FEXP2, G_FLOG2},
+ Standard)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
+ .Div(S16, {{Vgpr16}, {Vgpr16}})
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32}})
+ .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
+ .Div(S64, {{Vgpr64}, {Vgpr64}});
using namespace Intrinsic;
addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}});
+ addRulesForIOpcs({amdgcn_groupstaticsize}).Any({{S32}, {{Sgpr32}, {IntrId}}});
+
// This is "intrinsic lane mask" it was set to i32/i64 in llvm-ir.
- addRulesForIOpcs({amdgcn_end_cf}).Any({{_, S32}, {{}, {None, Sgpr32}}});
+ addRulesForIOpcs({amdgcn_end_cf})
+ .Any({{_, UniS32}, {{}, {IntrId, Sgpr32}}})
+ .Any({{_, UniS64}, {{}, {IntrId, Sgpr64}}});
addRulesForIOpcs({amdgcn_if_break}, Standard)
+ .Uni(S64, {{Sgpr64}, {IntrId, Vcc, Sgpr64}})
.Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}});
addRulesForIOpcs({amdgcn_mbcnt_lo, amdgcn_mbcnt_hi}, Standard)
@@ -938,4 +1368,68 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
// readfirstlaning, just in case the register is not in an SGPR.
.Any({{UniS32, _, UniS32}, {{}, {Sgpr32, None, Vgpr32}}});
+ addRulesForIOpcs({amdgcn_s_sleep}).Any({{_, _}, {{}, {IntrId, Imm}}});
+
+ addRulesForIOpcs({amdgcn_mul_u24, amdgcn_mul_i24}, Standard)
+ .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}})
+ .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr32, Vgpr32}})
+ .Div(S64, {{Vgpr64}, {IntrId, Vgpr32, Vgpr32}});
+
+ addRulesForIOpcs({amdgcn_mulhi_u24, amdgcn_mulhi_i24, amdgcn_fmul_legacy},
+ Standard)
+ .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
+
+ addRulesForIOpcs({amdgcn_fma_legacy}, Standard)
+ .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
+
+ addRulesForIOpcs({amdgcn_frexp_mant, amdgcn_fract}, Standard)
+ .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}})
+ .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
+ .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
+ .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64}})
+ .Div(S64, {{Vgpr64}, {IntrId, Vgpr64}});
+
+ addRulesForIOpcs({amdgcn_prng_b32})
+ .Any({{UniS32}, {{UniInVgprS32}, {IntrId, Vgpr32}}})
+ .Any({{DivS32}, {{Vgpr32}, {IntrId, Vgpr32}}});
+
+ addRulesForIOpcs({amdgcn_sffbh}, Standard)
+ .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}})
+ .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
+
+ addRulesForIOpcs({amdgcn_ubfe, amdgcn_sbfe}, Standard)
+ .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
+ .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32, Sgpr32, Sgpr32}, S_BFE})
+ .Uni(S64, {{Sgpr64}, {IntrId, Sgpr64, Sgpr32, Sgpr32}, S_BFE})
+ .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr32, Vgpr32}, V_BFE});
+
+ addRulesForIOpcs({amdgcn_global_load_tr_b64})
+ .Any({{DivB64}, {{VgprB64}, {IntrId, SgprP1}}})
+ .Any({{DivB32}, {{VgprB32}, {IntrId, SgprP1}}});
+
+ addRulesForIOpcs({amdgcn_global_load_tr_b128})
+ .Any({{DivB64}, {{VgprB64}, {IntrId, SgprP1}}})
+ .Any({{DivB128}, {{VgprB128}, {IntrId, SgprP1}}});
+
+ addRulesForIOpcs({amdgcn_global_atomic_ordered_add_b64})
+ .Any({{DivS64}, {{Vgpr64}, {IntrId, VgprP1, Vgpr64}}});
+
+ addRulesForIOpcs({amdgcn_wwm, amdgcn_strict_wwm}, StandardB)
+ .Div(B32, {{VgprB32}, {IntrId, VgprB32}})
+ .Uni(B32, {{SgprB32}, {IntrId, SgprB32}})
+ .Div(B64, {{VgprB64}, {IntrId, VgprB64}})
+ .Uni(B64, {{SgprB64}, {IntrId, SgprB64}})
+ .Div(B96, {{VgprB96}, {IntrId, VgprB96}})
+ .Uni(B96, {{SgprB96}, {IntrId, SgprB96}})
+ .Div(B128, {{VgprB128}, {IntrId, VgprB128}})
+ .Uni(B128, {{SgprB128}, {IntrId, SgprB128}})
+ .Any({{UniB256}, {{SgprB256}, {IntrId, SgprB256}}})
+ .Any({{DivB256}, {{VgprB256}, {IntrId, VgprB256}}})
+ .Any({{UniB512}, {{SgprB512}, {IntrId, SgprB512}}})
+ .Any({{DivB512}, {{VgprB512}, {IntrId, VgprB512}}});
+
} // end initialize rules
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 93e0efd..eee4f62 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -60,24 +60,29 @@ enum UniformityLLTOpPredicateID {
// pointers
P0,
P1,
+ P2,
P3,
P4,
P5,
+ P8,
Ptr32,
Ptr64,
Ptr128,
UniP0,
UniP1,
+ UniP2,
UniP3,
UniP4,
UniP5,
+ UniP8,
UniPtr32,
UniPtr64,
UniPtr128,
DivP0,
DivP1,
+ DivP2,
DivP3,
DivP4,
DivP5,
@@ -88,18 +93,24 @@ enum UniformityLLTOpPredicateID {
// vectors
V2S16,
V2S32,
+ V2S64,
V3S32,
V4S32,
UniV2S16,
+ UniV2S32,
+ UniV2S64,
DivV2S16,
+ DivV2S32,
+ DivV2S64,
// B types
B32,
B64,
B96,
B128,
+ B160,
B256,
B512,
@@ -107,15 +118,19 @@ enum UniformityLLTOpPredicateID {
UniB64,
UniB96,
UniB128,
+ UniB160,
UniB256,
UniB512,
+ UniBRC,
DivB32,
DivB64,
DivB96,
DivB128,
+ DivB160,
DivB256,
DivB512,
+ DivBRC
};
// How to apply register bank on register operand.
@@ -134,10 +149,13 @@ enum RegBankLLTMappingApplyID {
Sgpr32,
Sgpr64,
Sgpr128,
+ SgprP0,
SgprP1,
+ SgprP2,
SgprP3,
SgprP4,
SgprP5,
+ SgprP8,
SgprPtr32,
SgprPtr64,
SgprPtr128,
@@ -150,6 +168,7 @@ enum RegBankLLTMappingApplyID {
SgprB128,
SgprB256,
SgprB512,
+ SgprBRC,
// vgpr scalars, pointers, vectors and B-types
Vgpr16,
@@ -158,6 +177,7 @@ enum RegBankLLTMappingApplyID {
Vgpr128,
VgprP0,
VgprP1,
+ VgprP2,
VgprP3,
VgprP4,
VgprP5,
@@ -166,24 +186,32 @@ enum RegBankLLTMappingApplyID {
VgprPtr128,
VgprV2S16,
VgprV2S32,
+ VgprV3S32,
VgprB32,
VgprB64,
VgprB96,
VgprB128,
+ VgprB160,
VgprB256,
VgprB512,
+ VgprBRC,
VgprV4S32,
+ VgprV2S64,
// Dst only modifiers: read-any-lane and truncs
UniInVcc,
UniInVgprS16,
UniInVgprS32,
+ UniInVgprS64,
UniInVgprV2S16,
+ UniInVgprV2S32,
UniInVgprV4S32,
+ UniInVgprV2S64,
UniInVgprB32,
UniInVgprB64,
UniInVgprB96,
UniInVgprB128,
+ UniInVgprB160,
UniInVgprB256,
UniInVgprB512,
@@ -198,6 +226,7 @@ enum RegBankLLTMappingApplyID {
Sgpr32AExtBoolInReg,
Sgpr32SExt,
Sgpr32ZExt,
+ Vgpr32AExt,
Vgpr32SExt,
Vgpr32ZExt,
};
@@ -216,14 +245,23 @@ enum LoweringMethodID {
S_BFE,
V_BFE,
VgprToVccCopy,
+ UniMAD64,
+ UniMul64,
+ DivSMulToMAD,
SplitTo32,
+ SplitTo32Mul,
+ ScalarizeToS16,
SplitTo32Select,
SplitTo32SExtInReg,
Ext32To64,
UniCstExt,
SplitLoad,
WidenLoad,
- WidenMMOToS32
+ WidenMMOToS32,
+ UnpackAExt,
+ VerifyAllSgpr,
+ ApplyAllVgpr,
+ UnmergeToShiftTrunc
};
enum FastRulesTypes {
@@ -277,7 +315,7 @@ public:
SetOfRulesForOpcode();
SetOfRulesForOpcode(FastRulesTypes FastTypes);
- const RegBankLLTMapping &
+ const RegBankLLTMapping *
findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI,
const MachineUniformityInfo &MUI) const;
@@ -297,7 +335,7 @@ private:
class RegBankLegalizeRules {
const GCNSubtarget *ST;
MachineRegisterInfo *MRI;
- // Separate maps for G-opcodes and instrinsics since they are in different
+ // Separate maps for G-opcodes and intrinsics since they are in different
// enums. Multiple opcodes can share same set of rules.
// RulesAlias = map<Opcode, KeyOpcode>
// Rules = map<KeyOpcode, SetOfRulesForOpcode>
@@ -375,7 +413,7 @@ public:
MRI = &_MRI;
};
- const SetOfRulesForOpcode &getRulesForOpc(MachineInstr &MI) const;
+ const SetOfRulesForOpcode *getRulesForOpc(MachineInstr &MI) const;
};
} // end namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 56807a4..e8f316d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -287,9 +287,6 @@ unsigned AMDGPURegisterBankInfo::getBreakDownCost(
const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
LLT Ty) const {
- if (&RC == &AMDGPU::SReg_1RegClass)
- return AMDGPU::VCCRegBank;
-
// We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
// VCC-like use.
if (TRI->isSGPRClass(&RC)) {
@@ -471,7 +468,7 @@ RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
const MachineInstr &MI) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -1263,11 +1260,14 @@ unsigned AMDGPURegisterBankInfo::setBufferOffsets(
}
}
+ const bool CheckNUW = Subtarget.hasGFX1250Insts();
Register Base;
unsigned Offset;
std::tie(Base, Offset) =
- AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
+ AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset,
+ /*KnownBits=*/nullptr,
+ /*CheckNUW=*/CheckNUW);
uint32_t SOffset, ImmOffset;
if ((int)Offset > 0 &&
@@ -1292,7 +1292,8 @@ unsigned AMDGPURegisterBankInfo::setBufferOffsets(
// Handle the variable sgpr + vgpr case.
MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
- if (Add && (int)Offset >= 0) {
+ if (Add && (int)Offset >= 0 &&
+ (!CheckNUW || Add->getFlag(MachineInstr::NoUWrap))) {
Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);
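
Editorial note: the reason the fold is gated on NoUWrap is worth spelling out. Splitting a combined offset into separate base/immediate fields silently assumes the original add did not wrap; the NUW flag is what licenses that assumption. A small worked example of the failure mode it rules out (illustrative, independent of this code):

    #include <cstdint>

    // Without NUW, (Base + Imm) may wrap: Base = 0xFFFFFFF0, Imm = 0x20
    // yields a combined offset of 0x10. Re-issuing the access as base
    // 0xFFFFFFF0 plus an immediate 0x20 field would not address the wrapped
    // value, so on targets where this matters (hasGFX1250Insts here) the
    // split requires the NoUWrap flag on the add.
    static_assert(uint32_t(0xFFFFFFF0u) + 0x20u == 0x10u, "32-bit add wraps");
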
@@ -1561,8 +1562,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
(Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
- if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
- llvm_unreachable("failed to constrain BFE");
+ constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this);
MI.eraseFromParent();
return true;
@@ -1873,11 +1873,11 @@ bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
B.buildInstr(AMDGPU::V_MOV_B32_e32)
- .addDef(TmpReg0)
- .addUse(SrcReg, 0, AMDGPU::sub0);
+ .addDef(TmpReg0)
+ .addUse(SrcReg, {}, AMDGPU::sub0);
B.buildInstr(AMDGPU::V_MOV_B32_e32)
- .addDef(TmpReg1)
- .addUse(SrcReg, 0, AMDGPU::sub1);
+ .addDef(TmpReg1)
+ .addUse(SrcReg, {}, AMDGPU::sub1);
B.buildInstr(AMDGPU::REG_SEQUENCE)
.addDef(DstReg)
.addUse(TmpReg0)
@@ -2412,7 +2412,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
if (DstBank == &AMDGPU::VCCRegBank)
break;
- MachineFunction *MF = MI.getParent()->getParent();
+ MachineFunction *MF = MI.getMF();
ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
LegalizerHelper Helper(*MF, ApplyBank, B);
@@ -2492,7 +2492,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// There is no VALU abs instruction so we need to replace it with a sub and
// max combination.
if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
- MachineFunction *MF = MI.getParent()->getParent();
+ MachineFunction *MF = MI.getMF();
ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
LegalizerHelper Helper(*MF, Apply, B);
@@ -3114,6 +3114,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
@@ -3283,6 +3285,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(B, MI, 2); // M0
return;
}
+ case Intrinsic::amdgcn_s_alloc_vgpr:
+ constrainOpWithReadfirstlane(B, MI, 2);
+ return;
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
// FIXME: Should this use a waterfall loop?
@@ -3297,7 +3302,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(B, MI, 1); // M0
return;
case Intrinsic::amdgcn_raw_buffer_load_lds:
- case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
+ case Intrinsic::amdgcn_raw_buffer_load_async_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: {
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 1); // rsrc
constrainOpWithReadfirstlane(B, MI, 2); // M0
@@ -3305,7 +3312,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case Intrinsic::amdgcn_struct_buffer_load_lds:
- case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
+ case Intrinsic::amdgcn_struct_buffer_load_async_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 1); // rsrc
constrainOpWithReadfirstlane(B, MI, 2); // M0
@@ -3321,7 +3330,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case Intrinsic::amdgcn_load_to_lds:
- case Intrinsic::amdgcn_global_load_lds: {
+ case Intrinsic::amdgcn_load_async_to_lds:
+ case Intrinsic::amdgcn_global_load_lds:
+ case Intrinsic::amdgcn_global_load_async_lds: {
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 2);
return;
@@ -3348,6 +3359,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(B, MI, 1);
return;
case Intrinsic::amdgcn_s_barrier_join:
+ case Intrinsic::amdgcn_s_wakeup_barrier:
constrainOpWithReadfirstlane(B, MI, 1);
return;
case Intrinsic::amdgcn_s_barrier_init:
@@ -3496,6 +3508,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs);
break;
}
+ case AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR:
+ case AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR:
case AMDGPU::G_LOAD:
case AMDGPU::G_ZEXTLOAD:
case AMDGPU::G_SEXTLOAD: {
@@ -3607,7 +3621,7 @@ unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
}
bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg())
@@ -3623,7 +3637,7 @@ bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
@@ -3641,7 +3655,7 @@ AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
@@ -3665,7 +3679,7 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
@@ -3744,7 +3758,7 @@ AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 2> OpdsMapping(2);
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
@@ -3834,7 +3848,7 @@ AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
//
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
@@ -4084,6 +4098,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case AMDGPU::G_FPTOSI:
case AMDGPU::G_FPTOUI:
+ case AMDGPU::G_FPTOSI_SAT:
+ case AMDGPU::G_FPTOUI_SAT:
case AMDGPU::G_SITOFP:
case AMDGPU::G_UITOFP: {
unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
@@ -4502,6 +4518,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
@@ -4577,6 +4595,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
+ case AMDGPU::G_AMDGPU_SPONENTRY: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ break;
+ }
case AMDGPU::G_INTRINSIC:
case AMDGPU::G_INTRINSIC_CONVERGENT: {
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
@@ -4835,6 +4858,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_perm_pk16_b4_u4:
case Intrinsic::amdgcn_perm_pk16_b6_u4:
case Intrinsic::amdgcn_perm_pk16_b8_u4:
+ case Intrinsic::amdgcn_add_max_i32:
+ case Intrinsic::amdgcn_add_max_u32:
+ case Intrinsic::amdgcn_add_min_i32:
+ case Intrinsic::amdgcn_add_min_u32:
+ case Intrinsic::amdgcn_pk_add_max_i16:
+ case Intrinsic::amdgcn_pk_add_max_u16:
+ case Intrinsic::amdgcn_pk_add_min_i16:
+ case Intrinsic::amdgcn_pk_add_min_u16:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
@@ -5073,17 +5104,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned MinNumRegsRequired = DstSize / 32;
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired);
+
OpdsMapping[0] =
- Info->getMinNumAGPRs() >= MinNumRegsRequired
- ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
- : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] =
- Info->getMinNumAGPRs() >= MinNumRegsRequired
- ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
- : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI);
@@ -5209,11 +5240,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case Intrinsic::amdgcn_wave_reduce_add:
+ case Intrinsic::amdgcn_wave_reduce_fadd:
case Intrinsic::amdgcn_wave_reduce_sub:
+ case Intrinsic::amdgcn_wave_reduce_fsub:
case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_umin:
+ case Intrinsic::amdgcn_wave_reduce_fmin:
case Intrinsic::amdgcn_wave_reduce_max:
case Intrinsic::amdgcn_wave_reduce_umax:
+ case Intrinsic::amdgcn_wave_reduce_fmax:
case Intrinsic::amdgcn_wave_reduce_and:
case Intrinsic::amdgcn_wave_reduce_or:
case Intrinsic::amdgcn_wave_reduce_xor: {
@@ -5225,11 +5260,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
break;
}
- case Intrinsic::amdgcn_s_bitreplicate:
+ case Intrinsic::amdgcn_s_bitreplicate: {
Register MaskReg = MI.getOperand(2).getReg();
unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
+ break;
+ }
+ case Intrinsic::amdgcn_wave_shuffle: {
+ unsigned OpSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
+ break;
+ }
}
break;
}
@@ -5296,12 +5340,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
- case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
- case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
@@ -5311,12 +5353,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_ds_load_tr16_b128:
case Intrinsic::amdgcn_ds_load_tr4_b64:
case Intrinsic::amdgcn_ds_load_tr6_b96:
- case Intrinsic::amdgcn_flat_load_monitor_b32:
- case Intrinsic::amdgcn_flat_load_monitor_b64:
- case Intrinsic::amdgcn_flat_load_monitor_b128:
- case Intrinsic::amdgcn_global_load_monitor_b32:
- case Intrinsic::amdgcn_global_load_monitor_b64:
- case Intrinsic::amdgcn_global_load_monitor_b128:
case Intrinsic::amdgcn_ds_read_tr4_b64:
case Intrinsic::amdgcn_ds_read_tr6_b96:
case Intrinsic::amdgcn_ds_read_tr8_b64:
@@ -5359,6 +5395,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
break;
+ case Intrinsic::amdgcn_s_alloc_vgpr:
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ break;
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
// This must be an SGPR, but accept a VGPR.
@@ -5418,7 +5458,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case Intrinsic::amdgcn_raw_buffer_load_lds:
- case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
+ case Intrinsic::amdgcn_raw_buffer_load_async_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: {
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
@@ -5451,7 +5493,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case Intrinsic::amdgcn_struct_buffer_load_lds:
- case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
+ case Intrinsic::amdgcn_struct_buffer_load_async_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
@@ -5570,6 +5614,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
break;
case Intrinsic::amdgcn_s_barrier_join:
+ case Intrinsic::amdgcn_s_wakeup_barrier:
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
break;
case Intrinsic::amdgcn_s_barrier_init:
@@ -5696,6 +5741,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_FMAX:
case AMDGPU::G_ATOMICRMW_UINC_WRAP:
case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
+ case AMDGPU::G_ATOMICRMW_USUB_COND:
+ case AMDGPU::G_ATOMICRMW_USUB_SAT:
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
@@ -5728,6 +5775,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN:
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
break;
+ case AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR:
+ case AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
+ break;
+ }
}
return getInstructionMapping(/*ID*/1, /*Cost*/1,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 1c1a6da..c37d309 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -15,7 +15,7 @@ def VGPRRegBank : RegisterBank<"VGPR",
>;
// It is helpful to distinguish conditions from ordinary SGPRs.
-def VCCRegBank : RegisterBank <"VCC", [SReg_1]>;
+def VCCRegBank : RegisterBank<"VCC", [SReg_32, SReg_64]>;
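
Editorial note: a plausible reading of why SReg_1 disappears here is that VCC is a per-lane mask, so its physical width tracks wave size rather than value type; letting the bank span both the 32- and 64-bit SGPR classes models that directly. In C++ terms (illustrative only):

    #include <cstdint>

    // A wave's condition mask holds one bit per lane, so VCC's storage is a
    // 32-bit SGPR in wave32 and a 64-bit SGPR pair in wave64.
    using LaneMaskW32 = uint32_t;
    using LaneMaskW64 = uint64_t;
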
def AGPRRegBank : RegisterBank <"AGPR",
[AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_288, AReg_320, AReg_352, AReg_384, AReg_512, AReg_1024]
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 0ea9add..4e664e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -256,17 +256,13 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
// Pseudo used just to encode the underlying global. Is there a better
// way to track this?
+ // TODO: Some of the generic call-like pseudos do not encode the callee,
+    // so we treat this, overly conservatively, as an indirect call.
const MachineOperand *CalleeOp =
TII->getNamedOperand(MI, AMDGPU::OpName::callee);
- const Function *Callee = getCalleeFunction(*CalleeOp);
-
- // Avoid crashing on undefined behavior with an illegal call to a
- // kernel. If a callsite's calling convention doesn't match the
- // function's, it's undefined behavior. If the callsite calling
- // convention does match, that would have errored earlier.
- if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
- report_fatal_error("invalid call to entry function");
+ const Function *Callee =
+ CalleeOp ? getCalleeFunction(*CalleeOp) : nullptr;
auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
return F == &MF.getFunction();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index 89c16da..7a5db42 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -32,6 +32,7 @@
#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"
@@ -96,8 +97,8 @@ public:
/// Compute the register class constraints based on the uses of \p Reg,
/// excluding MFMA uses which can be rewritten to change the register
- /// class constraint. This should be nearly identical to
- /// MachineRegisterInfo::recomputeRegClass.
+  /// class constraint. MFMA scale operands need to be constraint-checked.
+ /// This should be nearly identical to MachineRegisterInfo::recomputeRegClass.
/// \p RewriteCandidates will collect the set of MFMA instructions that need
/// to have the opcode mutated to perform the replacement.
@@ -151,9 +152,16 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
// We can swap the classes of dst + src2 as a pair to AGPR, so ignore the
// effects of rewrite candidates. It just so happens that we can use
- // either AGPR or VGPR in src0/src1, so don't bother checking the
- // constraint effects of the individual operands.
+ // either AGPR or VGPR in src0/src1. We still need to check constraint
+      // effects for the scale variant, which does not allow AGPRs.
if (isRewriteCandidate(*MI)) {
+ int AGPROp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode());
+ const MCInstrDesc &AGPRDesc = TII.get(AGPROp);
+ const TargetRegisterClass *NewRC =
+ TII.getRegClass(AGPRDesc, MO.getOperandNo());
+ if (!TRI.hasAGPRs(NewRC))
+ return false;
+
const MachineOperand *VDst =
TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
const MachineOperand *Src2 =
@@ -587,10 +595,7 @@ public:
static char ID;
RegisterClassInfo RegClassInfo;
- AMDGPURewriteAGPRCopyMFMALegacy() : MachineFunctionPass(ID) {
- initializeAMDGPURewriteAGPRCopyMFMALegacyPass(
- *PassRegistry::getPassRegistry());
- }
+ AMDGPURewriteAGPRCopyMFMALegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -659,7 +664,11 @@ AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF,
if (!Impl.run(MF))
return PreservedAnalyses::all();
auto PA = getMachineFunctionPassPreservedAnalyses();
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<LiveStacksAnalysis>();
+ PA.preserveSet<CFGAnalyses>()
+ .preserve<LiveStacksAnalysis>()
+ .preserve<VirtRegMapAnalysis>()
+ .preserve<SlotIndexesAnalysis>()
+ .preserve<LiveIntervalsAnalysis>()
+ .preserve<LiveRegMatrixAnalysis>();
return PA;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index 4b1f80c..a2e16c7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -299,7 +299,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
if (Replacements.empty())
return false;
- LLVMContext &Ctx = F.getParent()->getContext();
+ LLVMContext &Ctx = F.getContext();
StructType *NewRetTy = StructType::create(Ctx, ReturnTypes, F.getName());
FunctionType *NewFuncTy = FunctionType::get(NewRetTy,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 2393346..963bb91 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -237,8 +237,6 @@ def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
def : SourceOfDivergence<int_r600_read_tidig_x>;
def : SourceOfDivergence<int_r600_read_tidig_y>;
def : SourceOfDivergence<int_r600_read_tidig_z>;
-def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>;
-def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>;
@@ -409,7 +407,17 @@ def : AlwaysUniform<int_amdgcn_cluster_workgroup_max_flat_id>;
def : AlwaysUniform<int_amdgcn_workgroup_id_x>;
def : AlwaysUniform<int_amdgcn_workgroup_id_y>;
def : AlwaysUniform<int_amdgcn_workgroup_id_z>;
+def : AlwaysUniform<int_amdgcn_s_alloc_vgpr>;
def : AlwaysUniform<int_amdgcn_s_getpc>;
def : AlwaysUniform<int_amdgcn_s_getreg>;
def : AlwaysUniform<int_amdgcn_s_memrealtime>;
def : AlwaysUniform<int_amdgcn_s_memtime>;
+
+def AMDGPUImageDMaskIntrinsicTable : GenericTable {
+ let FilterClass = "AMDGPUImageDMaskIntrinsic";
+ let Fields = ["Intr"];
+
+ let PrimaryKey = ["Intr"];
+ let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic";
+ let PrimaryKeyEarlyOut = 1;
+}
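
Editorial note: for readers unfamiliar with GenericTable, TableGen emits a sorted record array plus a lookup function named by PrimaryKeyName. The exact generated signature depends on the field types; the following is an ASSUMED approximation of the consumer-side shape, not verified against the generated header:

    // ASSUMPTION: rough shape of the TableGen output for the table above; the
    // real declarations live in the generated searchable-tables .inc file.
    struct AMDGPUImageDMaskIntrinsicRow {
      unsigned Intr; // intrinsic ID, the table's primary key
    };
    const AMDGPUImageDMaskIntrinsicRow *getAMDGPUImageDMaskIntrinsic(unsigned Intr);
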
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp
index 2941a48..5b8ee5f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp
@@ -7,13 +7,53 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUSelectionDAGInfo.h"
-#include "AMDGPUISelLowering.h"
+
+#define GET_SDNODE_DESC
+#include "AMDGPUGenSDNodeInfo.inc"
using namespace llvm;
+AMDGPUSelectionDAGInfo::AMDGPUSelectionDAGInfo()
+ : SelectionDAGGenTargetInfo(AMDGPUGenSDNodeInfo) {}
+
AMDGPUSelectionDAGInfo::~AMDGPUSelectionDAGInfo() = default;
-bool AMDGPUSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const {
- return Opcode >= AMDGPUISD::FIRST_MEMORY_OPCODE &&
- Opcode <= AMDGPUISD::LAST_MEMORY_OPCODE;
+const char *AMDGPUSelectionDAGInfo::getTargetNodeName(unsigned Opcode) const {
+#define NODE_NAME_CASE(node) \
+ case AMDGPUISD::node: \
+ return "AMDGPUISD::" #node;
+
+ switch (static_cast<AMDGPUISD::NodeType>(Opcode)) {
+ // These nodes don't have corresponding entries in *.td files yet.
+ NODE_NAME_CASE(WAVE_ADDRESS)
+ NODE_NAME_CASE(MAD_I64_I32)
+ NODE_NAME_CASE(MAD_U64_U32)
+ NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
+ // These do, but only when compiling R600.td,
+ // and the enum is generated from AMDGPU.td.
+ NODE_NAME_CASE(DOT4)
+ NODE_NAME_CASE(TEXTURE_FETCH)
+ NODE_NAME_CASE(R600_EXPORT)
+ NODE_NAME_CASE(CONST_ADDRESS)
+ NODE_NAME_CASE(DUMMY_CHAIN)
+ }
+
+#undef NODE_NAME_CASE
+
+ return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode);
+}
+
+void AMDGPUSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
+ const SDNode *N) const {
+ switch (N->getOpcode()) {
+ case AMDGPUISD::IF:
+ // result #0 must have type i1, but has type i32/i64
+ case AMDGPUISD::ELSE:
+ case AMDGPUISD::LOOP:
+ // operand #1 must have type i1, but has type i32/i64
+ case AMDGPUISD::LDS:
+ // result #0 must have type i64 (iPTR), but has type i32
+ return;
+ }
+ SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h
index 3280be7..bae614a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h
@@ -11,13 +11,49 @@
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#define GET_SDNODE_ENUM
+#include "AMDGPUGenSDNodeInfo.inc"
+
namespace llvm {
+namespace AMDGPUISD {
+
+enum NodeType : unsigned {
+  // Convert an unswizzled wave-uniform stack address to an address compatible
+ // with a vector offset for use in stack access.
+ WAVE_ADDRESS = GENERATED_OPCODE_END,
+
+ DOT4,
+ MAD_U64_U32,
+ MAD_I64_I32,
+ TEXTURE_FETCH,
+ R600_EXPORT,
+ CONST_ADDRESS,
-class AMDGPUSelectionDAGInfo : public SelectionDAGTargetInfo {
+ /// This node is for VLIW targets and it is used to represent a vector
+ /// that is stored in consecutive registers with the same channel.
+ /// For example:
+ /// |X |Y|Z|W|
+ /// T0|v.x| | | |
+ /// T1|v.y| | | |
+ /// T2|v.z| | | |
+ /// T3|v.w| | | |
+ BUILD_VERTICAL_VECTOR,
+
+ DUMMY_CHAIN,
+};
+
+} // namespace AMDGPUISD
+
+class AMDGPUSelectionDAGInfo : public SelectionDAGGenTargetInfo {
public:
+ AMDGPUSelectionDAGInfo();
+
~AMDGPUSelectionDAGInfo() override;
- bool isTargetMemoryOpcode(unsigned Opcode) const override;
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ void verifyTargetNode(const SelectionDAG &DAG,
+ const SDNode *N) const override;
};
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index 9af8129..d04dc3e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -314,9 +314,7 @@ public:
#endif
bool empty() const { return Nodes.empty(); }
- const iterator_range<nodes_iterator> nodes() const {
- return {Nodes.begin(), Nodes.end()};
- }
+ iterator_range<nodes_iterator> nodes() const { return Nodes; }
const Node &getNode(unsigned ID) const { return *Nodes[ID]; }
unsigned getNumNodes() const { return Nodes.size(); }
@@ -993,7 +991,7 @@ void RecursiveSearchSplitting::run() {
{
SplitModuleTimer SMT("recursive_search_pick", "partitioning");
SplitProposal SP(SG, NumParts);
- pickPartition(/*BranchDepth=*/0, /*Idx=*/0, SP);
+    pickPartition(/*Depth=*/0, /*Idx=*/0, std::move(SP));
}
}
@@ -1140,7 +1138,7 @@ void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
LLVM_DEBUG(dbgs().indent(Depth)
<< " [lb] " << Idx << "=P" << CheapestPID << "? ");
BranchSP.add(CheapestPID, Cluster);
- pickPartition(Depth + 1, Idx + 1, BranchSP);
+ pickPartition(Depth + 1, Idx + 1, std::move(BranchSP));
}
// ms = most similar = put in partition with the most in common
@@ -1149,7 +1147,7 @@ void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
LLVM_DEBUG(dbgs().indent(Depth)
<< " [ms] " << Idx << "=P" << MostSimilarPID << "? ");
BranchSP.add(MostSimilarPID, Cluster);
- pickPartition(Depth + 1, Idx + 1, BranchSP);
+ pickPartition(Depth + 1, Idx + 1, std::move(BranchSP));
}
return;
@@ -1163,7 +1161,7 @@ void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" +
std::to_string(NumProposalsSubmitted++));
LLVM_DEBUG(dbgs() << '\n');
- SubmitProposal(SP);
+ SubmitProposal(std::move(SP));
}
std::pair<unsigned, CostType>
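The std::move changes above assume the recursive sink takes its SplitProposal by value; each branch then pays one deliberate copy and moves into the callee instead of copying twice. A sketch of the pattern (the signature is an assumption; it is not shown in this hunk):

    void pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP);

    SplitProposal BranchSP = SP;              // one explicit copy per branch
    BranchSP.add(CheapestPID, Cluster);
    pickPartition(Depth + 1, Idx + 1, std::move(BranchSP)); // no second copy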
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 26e0b3df..300aca1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -32,16 +32,6 @@ using namespace llvm;
#define DEBUG_TYPE "amdgpu-subtarget"
-AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}
-
-bool AMDGPUSubtarget::useRealTrue16Insts() const {
- return hasTrue16BitInsts() && EnableRealTrue16Insts;
-}
-
-bool AMDGPUSubtarget::hasD16Writes32BitVgpr() const {
- return EnableD16Writes32BitVgpr;
-}
-
// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
@@ -282,7 +272,7 @@ bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
}
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
- Function *Kernel = I->getParent()->getParent();
+ Function *Kernel = I->getFunction();
unsigned MinSize = 0;
unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
bool IdQuery = false;
@@ -350,7 +340,7 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
}
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
- assert(AMDGPU::isKernel(F.getCallingConv()));
+ assert(AMDGPU::isKernel(F));
// We don't allocate the segment if we know the implicit arguments weren't
// used, even if the ABI implies we need them.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index ed03ef2..302fe7c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -42,40 +42,18 @@ public:
GFX10 = 9,
GFX11 = 10,
GFX12 = 11,
+ GFX13 = 12,
};
private:
Triple TargetTriple;
protected:
- bool GCN3Encoding = false;
- bool Has16BitInsts = false;
- bool HasTrue16BitInsts = false;
- bool HasFP8ConversionScaleInsts = false;
- bool HasBF8ConversionScaleInsts = false;
- bool HasFP4ConversionScaleInsts = false;
- bool HasFP6BF6ConversionScaleInsts = false;
- bool HasF16BF16ToFP6BF6ConversionScaleInsts = false;
- bool HasCvtPkF16F32Inst = false;
- bool HasF32ToF16BF16ConversionSRInsts = false;
- bool EnableRealTrue16Insts = false;
- bool EnableD16Writes32BitVgpr = false;
- bool HasBF16TransInsts = false;
- bool HasBF16ConversionInsts = false;
- bool HasBF16PackedInsts = false;
- bool HasMadMixInsts = false;
- bool HasMadMacF32Insts = false;
- bool HasDsSrc2Insts = false;
- bool HasSDWA = false;
- bool HasVOP3PInsts = false;
bool HasMulI24 = true;
bool HasMulU24 = true;
bool HasSMulHi = false;
- bool HasInv2PiInlineImm = false;
bool HasFminFmaxLegacy = true;
- bool EnablePromoteAlloca = false;
- bool HasTrigReducedRange = false;
- bool FastFMAF32 = false;
+
unsigned EUsPerCU = 4;
unsigned MaxWavesPerEU = 10;
unsigned LocalMemorySize = 0;
@@ -83,7 +61,7 @@ protected:
char WavefrontSizeLog2 = 0;
public:
- AMDGPUSubtarget(Triple TT);
+ AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}
static const AMDGPUSubtarget &get(const MachineFunction &MF);
static const AMDGPUSubtarget &get(const TargetMachine &TM,
@@ -132,13 +110,6 @@ public:
/// size, register usage, and/or lds usage.
std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
- /// Overload which uses the specified values for the flat work group sizes,
- /// rather than querying the function itself. \p FlatWorkGroupSizes Should
- /// correspond to the function's value for getFlatWorkGroupSizes.
- std::pair<unsigned, unsigned>
- getWavesPerEU(const Function &F,
- std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
-
/// Overload which uses the specified values for the flat workgroup sizes and
/// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
/// should correspond to the function's value for getFlatWorkGroupSizes and \p
@@ -206,16 +177,13 @@ public:
bool isGCN() const { return TargetTriple.isAMDGCN(); }
- bool isGCN3Encoding() const {
- return GCN3Encoding;
- }
-
- bool has16BitInsts() const {
- return Has16BitInsts;
- }
-
- /// Return true if the subtarget supports True16 instructions.
- bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
+  //===-------------------------------------------------------------------===//
+  // TableGen-generated feature getters.
+  //===-------------------------------------------------------------------===//
+#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
+ virtual bool GETTER() const { return false; }
+#include "AMDGPUGenSubtargetInfo.inc"
+  //===-------------------------------------------------------------------===//
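The generated .inc invokes GET_SUBTARGETINFO_MACRO once per feature as an (attribute, default, getter) triple, so the definition above stamps out one defaulted virtual per feature; e.g. (feature name taken from the getters deleted below):

    // GET_SUBTARGETINFO_MACRO(HasTrue16BitInsts, false, hasTrue16BitInsts)
    // expands here to:
    virtual bool hasTrue16BitInsts() const { return false; }

The concrete subtarget re-includes the same .inc with a macro body that returns the real feature field, overriding these defaults.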
/// Return true if real (non-fake) variants of True16 instructions using
/// 16-bit registers should be code-generated. Fake True16 instructions are
@@ -223,56 +191,8 @@ public:
/// operands and always use their low halves.
// TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
// supported and the support for fake True16 instructions is removed.
- bool useRealTrue16Insts() const;
-
- bool hasD16Writes32BitVgpr() const;
-
- bool hasBF16TransInsts() const { return HasBF16TransInsts; }
-
- bool hasBF16ConversionInsts() const {
- return HasBF16ConversionInsts;
- }
-
- bool hasBF16PackedInsts() const { return HasBF16PackedInsts; }
-
- bool hasMadMixInsts() const {
- return HasMadMixInsts;
- }
-
- bool hasFP8ConversionScaleInsts() const { return HasFP8ConversionScaleInsts; }
-
- bool hasBF8ConversionScaleInsts() const { return HasBF8ConversionScaleInsts; }
-
- bool hasFP4ConversionScaleInsts() const { return HasFP4ConversionScaleInsts; }
-
- bool hasFP6BF6ConversionScaleInsts() const {
- return HasFP6BF6ConversionScaleInsts;
- }
-
- bool hasF16BF16ToFP6BF6ConversionScaleInsts() const {
- return HasF16BF16ToFP6BF6ConversionScaleInsts;
- }
-
- bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; }
-
- bool hasF32ToF16BF16ConversionSRInsts() const {
- return HasF32ToF16BF16ConversionSRInsts;
- }
-
- bool hasMadMacF32Insts() const {
- return HasMadMacF32Insts || !isGCN();
- }
-
- bool hasDsSrc2Insts() const {
- return HasDsSrc2Insts;
- }
-
- bool hasSDWA() const {
- return HasSDWA;
- }
-
- bool hasVOP3PInsts() const {
- return HasVOP3PInsts;
+ bool useRealTrue16Insts() const {
+ return hasTrue16BitInsts() && enableRealTrue16Insts();
}
bool hasMulI24() const {
@@ -287,26 +207,10 @@ public:
return HasSMulHi;
}
- bool hasInv2PiInlineImm() const {
- return HasInv2PiInlineImm;
- }
-
bool hasFminFmaxLegacy() const {
return HasFminFmaxLegacy;
}
- bool hasTrigReducedRange() const {
- return HasTrigReducedRange;
- }
-
- bool hasFastFMAF32() const {
- return FastFMAF32;
- }
-
- bool isPromoteAllocaEnabled() const {
- return EnablePromoteAlloca;
- }
-
unsigned getWavefrontSize() const {
return 1 << WavefrontSizeLog2;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index 4a9437b..3fd554a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -47,8 +47,8 @@
// corresponds to offset, second member corresponds to size of LDS global
// being replaced and third represents the total aligned size. It will
// have name "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have
-// an intializer with static LDS related offsets and sizes initialized.
-// But for dynamic LDS related entries, offsets will be intialized to
+// an initializer with static LDS related offsets and sizes initialized.
+// But for dynamic LDS related entries, offsets will be initialized to
// previous static LDS allocation end offset. Sizes for them will be zero
// initially. These dynamic LDS offset and size values will be updated
// within the kernel, since kernel can read the dynamic LDS size
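A rough sketch of the per-global metadata layout this comment describes (struct and field names are hypothetical; only the offset/size/aligned-size triple is stated above):

    // One entry per replaced LDS global in llvm.amdgcn.sw.lds.<kernel-name>.md
    struct SwLDSMetadataEntry {
      uint32_t Offset;      // static: assigned offset; dynamic: end offset of
                            // the static allocation, patched in the kernel
      uint32_t Size;        // size of the replaced global (0 for dynamic LDS)
      uint32_t AlignedSize; // total size rounded up to the required alignment
    };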
@@ -271,7 +271,7 @@ void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) {
Function *CalledFunc = CallerCGN->getFunction();
if (!CalledFunc || CalledFunc->isDeclaration())
continue;
- if (AMDGPU::isKernelLDS(CalledFunc))
+ if (AMDGPU::isKernel(*CalledFunc))
continue;
for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end();
AI != E; ++AI) {
@@ -297,7 +297,7 @@ void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
for (User *V : GV->users()) {
if (auto *I = dyn_cast<Instruction>(V)) {
Function *F = I->getFunction();
- if (!isKernelLDS(F) && !F->isDeclaration())
+ if (!isKernel(*F) && !F->isDeclaration())
FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(GV);
}
}
@@ -523,7 +523,7 @@ static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
auto ReplaceUsesLambda = [Func](const Use &U) -> bool {
auto *V = U.getUser();
if (auto *Inst = dyn_cast<Instruction>(V)) {
- auto *Func1 = Inst->getParent()->getParent();
+ auto *Func1 = Inst->getFunction();
if (Func == Func1)
return true;
}
@@ -1169,7 +1169,7 @@ bool AMDGPUSwLowerLDS::run() {
if (!F || K.second.empty())
continue;
- assert(isKernelLDS(F));
+ assert(isKernel(*F));
// Only inserts if key isn't already in the map.
FuncLDSAccessInfo.KernelToLDSParametersMap.insert(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4958a20..49c60c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -17,9 +17,12 @@
#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
+#include "AMDGPUArgumentUsageInfo.h"
+#include "AMDGPUBarrierLatency.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUExportKernelRuntimeHandles.h"
+#include "AMDGPUHazardLatency.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPULowerVGPREncoding.h"
@@ -72,6 +75,7 @@
#include "llvm/CodeGen/AtomicExpand.h"
#include "llvm/CodeGen/BranchRelaxation.h"
#include "llvm/CodeGen/DeadMachineInstructionElim.h"
+#include "llvm/CodeGen/EarlyIfConversion.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
@@ -140,29 +144,36 @@ public:
const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC);
- void addIRPasses(AddIRPass &) const;
- void addCodeGenPrepare(AddIRPass &) const;
- void addPreISel(AddIRPass &addPass) const;
- void addILPOpts(AddMachinePass &) const;
- void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
- Error addInstSelector(AddMachinePass &) const;
- void addPreRewrite(AddMachinePass &) const;
- void addMachineSSAOptimization(AddMachinePass &) const;
- void addPostRegAlloc(AddMachinePass &) const;
- void addPreEmitPass(AddMachinePass &) const;
- void addPreEmitRegAlloc(AddMachinePass &) const;
- Error addRegAssignmentOptimized(AddMachinePass &) const;
- void addPreRegAlloc(AddMachinePass &) const;
- void addOptimizedRegAlloc(AddMachinePass &) const;
- void addPreSched2(AddMachinePass &) const;
+ void addIRPasses(PassManagerWrapper &PMW) const;
+ void addCodeGenPrepare(PassManagerWrapper &PMW) const;
+ void addPreISel(PassManagerWrapper &PMW) const;
+  void addILPOpts(PassManagerWrapper &PMW) const;
+ void addAsmPrinter(PassManagerWrapper &PMW, CreateMCStreamer) const;
+ Error addInstSelector(PassManagerWrapper &PMW) const;
+ void addPreRewrite(PassManagerWrapper &PMW) const;
+ void addMachineSSAOptimization(PassManagerWrapper &PMW) const;
+ void addPostRegAlloc(PassManagerWrapper &PMW) const;
+  void addPreEmitPass(PassManagerWrapper &PMW) const;
+ void addPreEmitRegAlloc(PassManagerWrapper &PMW) const;
+ Error addRegAssignmentFast(PassManagerWrapper &PMW) const;
+ Error addRegAssignmentOptimized(PassManagerWrapper &PMW) const;
+ void addPreRegAlloc(PassManagerWrapper &PMW) const;
+ Error addFastRegAlloc(PassManagerWrapper &PMW) const;
+ Error addOptimizedRegAlloc(PassManagerWrapper &PMW) const;
+ void addPreSched2(PassManagerWrapper &PMW) const;
+ void addPostBBSections(PassManagerWrapper &PMW) const;
+
+private:
+ Error validateRegAllocOptions() const;
+public:
/// Check if a pass is enabled given \p Opt option. The option always
/// overrides defaults if explicitly used. Otherwise its default will be used
/// given that a pass shall work at an optimization \p Level minimum.
bool isPassEnabled(const cl::opt<bool> &Opt,
CodeGenOptLevel Level = CodeGenOptLevel::Default) const;
- void addEarlyCSEOrGVNPass(AddIRPass &) const;
- void addStraightLineScalarOptimizationPasses(AddIRPass &) const;
+ void addEarlyCSEOrGVNPass(PassManagerWrapper &PMW) const;
+ void addStraightLineScalarOptimizationPasses(PassManagerWrapper &PMW) const;
};
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
@@ -237,6 +248,63 @@ static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use for WWM registers"));
+// New pass manager register allocator options for AMDGPU
+static cl::opt<RegAllocType, false, RegAllocTypeParser> SGPRRegAllocNPM(
+ "sgpr-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default),
+ cl::desc("Register allocator for SGPRs (new pass manager)"));
+
+static cl::opt<RegAllocType, false, RegAllocTypeParser> VGPRRegAllocNPM(
+ "vgpr-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default),
+ cl::desc("Register allocator for VGPRs (new pass manager)"));
+
+static cl::opt<RegAllocType, false, RegAllocTypeParser> WWMRegAllocNPM(
+ "wwm-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default),
+ cl::desc("Register allocator for WWM registers (new pass manager)"));
+
+/// Check if the given RegAllocType is supported for AMDGPU NPM register
+/// allocation. Only Fast and Greedy are supported; Basic and PBQP are not.
+static Error checkRegAllocSupported(RegAllocType RAType, StringRef RegName) {
+ if (RAType == RegAllocType::Basic || RAType == RegAllocType::PBQP) {
+ return make_error<StringError>(
+ Twine("unsupported register allocator '") +
+ (RAType == RegAllocType::Basic ? "basic" : "pbqp") + "' for " +
+ RegName + " registers",
+ inconvertibleErrorCode());
+ }
+ return Error::success();
+}
+
+Error AMDGPUCodeGenPassBuilder::validateRegAllocOptions() const {
+ // 1. Generic --regalloc-npm is not supported for AMDGPU.
+ if (Opt.RegAlloc != RegAllocType::Unset) {
+ return make_error<StringError>(
+ "-regalloc-npm not supported for amdgcn. Use -sgpr-regalloc-npm, "
+ "-vgpr-regalloc-npm, and -wwm-regalloc-npm",
+ inconvertibleErrorCode());
+ }
+
+ // 2. Legacy PM regalloc options are not compatible with NPM.
+ if (SGPRRegAlloc.getNumOccurrences() > 0 ||
+ VGPRRegAlloc.getNumOccurrences() > 0 ||
+ WWMRegAlloc.getNumOccurrences() > 0) {
+ return make_error<StringError>(
+ "-sgpr-regalloc, -vgpr-regalloc, and -wwm-regalloc are legacy PM "
+ "options. Use -sgpr-regalloc-npm, -vgpr-regalloc-npm, and "
+ "-wwm-regalloc-npm with the new pass manager",
+ inconvertibleErrorCode());
+ }
+
+ // 3. Only Fast and Greedy allocators are supported for AMDGPU.
+ if (auto Err = checkRegAllocSupported(SGPRRegAllocNPM, "SGPR"))
+ return Err;
+ if (auto Err = checkRegAllocSupported(WWMRegAllocNPM, "WWM"))
+ return Err;
+ if (auto Err = checkRegAllocSupported(VGPRRegAllocNPM, "VGPR"))
+ return Err;
+
+ return Error::success();
+}
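A hedged usage sketch, assuming llc's experimental new-pass-manager path; the validation above rejects anything other than fast or greedy:

    llc -enable-new-pm -mtriple=amdgcn -sgpr-regalloc-npm=greedy \
        -wwm-regalloc-npm=fast -vgpr-regalloc-npm=greedy -o out.s in.ll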
+
static void initializeDefaultSGPRRegisterAllocatorOnce() {
RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
@@ -464,6 +532,11 @@ static cl::opt<bool> EnableScalarIRPasses(
cl::init(true),
cl::Hidden);
+static cl::opt<bool> EnableLowerExecSync(
+ "amdgpu-enable-lower-exec-sync",
+ cl::desc("Enable lowering of execution synchronization."), cl::init(true),
+ cl::Hidden);
+
static cl::opt<bool>
EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
cl::desc("Enable lowering of lds to global memory pass "
@@ -566,9 +639,10 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILoadStoreOptimizerLegacyPass(*PR);
initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
+ initializeAMDGPULowerExecSyncLegacyPass(*PR);
initializeAMDGPUSwLowerLDSLegacyPass(*PR);
initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
- initializeAMDGPUArgumentUsageInfoPass(*PR);
+ initializeAMDGPUArgumentUsageInfoWrapperLegacyPass(*PR);
initializeAMDGPUAtomicOptimizerPass(*PR);
initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPUPromoteKernelArgumentsPass(*PR);
@@ -618,6 +692,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
+ initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -639,6 +714,8 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
+ DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
return DAG;
}
@@ -659,6 +736,8 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
+ DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
return DAG;
}
@@ -737,7 +816,7 @@ static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
return "r600";
}
-static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
+static Reloc::Model getEffectiveRelocModel() {
// The AMDGPU toolchain only supports generating shared objects, so we
// must always use PIC.
return Reloc::PIC_;
@@ -751,8 +830,8 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
CodeGenOptLevel OptLevel)
: CodeGenTargetMachineImpl(
T, TT.computeDataLayout(), TT, getGPUOrDefault(TT, CPU), FS, Options,
- getEffectiveRelocModel(RM),
- getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
+ getEffectiveRelocModel(), getEffectiveCodeModel(CM, CodeModel::Small),
+ OptLevel),
TLOF(createTLOF(getTargetTriple())) {
initAsmInfo();
if (TT.isAMDGCN()) {
@@ -802,7 +881,8 @@ static bool mustPreserveGV(const GlobalValue &GV) {
}
void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
- AAM.registerFunctionAnalysis<AMDGPUAA>();
+ if (EnableAMDGPUAliasAnalysis)
+ AAM.registerFunctionAnalysis<AMDGPUAA>();
}
static Expected<ScanOptions>
@@ -812,7 +892,7 @@ parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
Params.consume_front("strategy=");
auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
.Case("dpp", ScanOptions::DPP)
- .Cases("iterative", "", ScanOptions::Iterative)
+ .Cases({"iterative", ""}, ScanOptions::Iterative)
.Case("none", ScanOptions::None)
.Default(std::nullopt);
if (Result)
@@ -884,9 +964,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
if (EarlyInlineAll && !EnableFunctionCalls)
PM.addPass(AMDGPUAlwaysInlinePass());
-
- if (EnableUniformIntrinsicCombine)
- PM.addPass(AMDGPUUniformIntrinsicCombinePass());
});
PB.registerPeepholeEPCallback(
@@ -897,6 +974,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
FPM.addPass(AMDGPUUseNativeCallsPass());
if (EnableLibCallSimplify)
FPM.addPass(AMDGPUSimplifyLibCallsPass());
+
+ if (EnableUniformIntrinsicCombine)
+ FPM.addPass(AMDGPUUniformIntrinsicCombinePass());
});
PB.registerCGSCCOptimizerLateEPCallback(
@@ -958,6 +1038,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
// We want to support the -lto-partitions=N option as "best effort".
// For that, we need to lower LDS earlier in the pipeline before the
// module is partitioned for codegen.
+ if (EnableLowerExecSync)
+ PM.addPass(AMDGPULowerExecSyncPass());
if (EnableSwLowerLDS)
PM.addPass(AMDGPUSwLowerLDSPass(*this));
if (EnableLowerModuleLDS)
@@ -1197,6 +1279,8 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
EnableVOPD)
DAG->addMutation(createVOPDPairingMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
+ DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
return DAG;
}
//===----------------------------------------------------------------------===//
@@ -1213,10 +1297,6 @@ class GCNPassConfig final : public AMDGPUPassConfig {
public:
GCNPassConfig(TargetMachine &TM, PassManagerBase &PM)
: AMDGPUPassConfig(TM, PM) {
- // It is necessary to know the register usage of the entire call graph. We
- // allow calls without EnableAMDGPUFunctionCalls if they are marked
- // noinline, so this is always required.
- setRequiresCodeGenSCCOrder(true);
substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
}
@@ -1310,6 +1390,9 @@ void AMDGPUPassConfig::addIRPasses() {
isPassEnabled(EnableImageIntrinsicOptimizer))
addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
+ if (EnableUniformIntrinsicCombine)
+ addPass(createAMDGPUUniformIntrinsicCombineLegacyPass());
+
// This can be disabled by passing ::Disable here or on the command line
// with --expand-variadics-override=disable.
addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
@@ -1325,6 +1408,10 @@ void AMDGPUPassConfig::addIRPasses() {
// Make enqueued block runtime handles externally visible.
addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass());
+  // Lower execution synchronization operations.
+ if (EnableLowerExecSync)
+ addPass(createAMDGPULowerExecSyncLegacyPass());
+
// Lower LDS accesses to global memory pass if address sanitizer is enabled.
if (EnableSwLowerLDS)
addPass(createAMDGPUSwLowerLDSLegacyPass(&TM));
@@ -1410,9 +1497,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
// many cases.
addPass(createAMDGPULowerBufferFatPointersPass());
addPass(createAMDGPULowerIntrinsicsLegacyPass());
- // In accordance with the above FIXME, manually force all the
- // function-level passes into a CGSCCPassManager.
- addPass(new DummyCGSCCPass());
}
// LowerSwitch pass may introduce unreachable blocks that can
@@ -2007,6 +2091,42 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
MFI->ArgInfo.WorkItemIDZ, 0, 0)))
return true;
+ // Parse FirstKernArgPreloadReg separately, since it's a Register,
+ // not ArgDescriptor.
+ if (YamlMFI.ArgInfo && YamlMFI.ArgInfo->FirstKernArgPreloadReg) {
+ const yaml::SIArgument &A = *YamlMFI.ArgInfo->FirstKernArgPreloadReg;
+
+ if (!A.IsRegister) {
+ // For stack arguments, we don't have RegisterName.SourceRange,
+      // but we should have some location info from the YAML parser.
+ const MemoryBuffer &Buffer =
+ *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
+ // Create a minimal valid source range
+ SMLoc Loc = SMLoc::getFromPointer(Buffer.getBufferStart());
+ SMRange Range(Loc, Loc);
+
+ Error = SMDiagnostic(
+ *PFS.SM, Loc, Buffer.getBufferIdentifier(), 1, 0, SourceMgr::DK_Error,
+ "firstKernArgPreloadReg must be a register, not a stack location", "",
+ {}, {});
+
+ SourceRange = Range;
+ return true;
+ }
+
+ Register Reg;
+ if (parseNamedRegisterReference(PFS, Reg, A.RegisterName.Value, Error)) {
+ SourceRange = A.RegisterName.SourceRange;
+ return true;
+ }
+
+ if (!AMDGPU::SGPR_32RegClass.contains(Reg))
+ return diagnoseRegisterClass(A.RegisterName);
+
+ MFI->ArgInfo.FirstKernArgPreloadReg = Reg;
+ MFI->NumUserSGPRs += YamlMFI.NumKernargPreloadSGPRs;
+ }
+
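A hypothetical MIR fragment the new parsing would accept (the firstKernArgPreloadReg spelling comes from the diagnostic above; the numKernargPreloadSGPRs key is assumed from the C++ field name):

    machineFunctionInfo:
      numKernargPreloadSGPRs: 2
      argumentInfo:
        firstKernArgPreloadReg: { reg: '$sgpr8' }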
if (ST.hasIEEEMode())
MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
if (ST.hasDX10ClampMode())
@@ -2046,63 +2166,74 @@ AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
// Exceptions and StackMaps are not supported, so these passes will never do
// anything.
// Garbage collection is not supported.
- disablePass<StackMapLivenessPass, FuncletLayoutPass,
- ShadowStackGCLoweringPass>();
+ disablePass<StackMapLivenessPass, FuncletLayoutPass, PatchableFunctionPass,
+ ShadowStackGCLoweringPass, GCLoweringPass>();
}
-void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
- if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
- addPass(AMDGPURemoveIncompatibleFunctionsPass(TM));
+void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
+ if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN()) {
+ flushFPMsToMPM(PMW);
+ addModulePass(AMDGPURemoveIncompatibleFunctionsPass(TM), PMW);
+ }
- addPass(AMDGPUPrintfRuntimeBindingPass());
+ flushFPMsToMPM(PMW);
+ addModulePass(AMDGPUPrintfRuntimeBindingPass(), PMW);
if (LowerCtorDtor)
- addPass(AMDGPUCtorDtorLoweringPass());
+ addModulePass(AMDGPUCtorDtorLoweringPass(), PMW);
if (isPassEnabled(EnableImageIntrinsicOptimizer))
- addPass(AMDGPUImageIntrinsicOptimizerPass(TM));
+ addFunctionPass(AMDGPUImageIntrinsicOptimizerPass(TM), PMW);
+ if (EnableUniformIntrinsicCombine)
+ addFunctionPass(AMDGPUUniformIntrinsicCombinePass(), PMW);
// This can be disabled by passing ::Disable here or on the command line
// with --expand-variadics-override=disable.
- addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering));
+ flushFPMsToMPM(PMW);
+ addModulePass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering), PMW);
- addPass(AMDGPUAlwaysInlinePass());
- addPass(AlwaysInlinerPass());
+ addModulePass(AMDGPUAlwaysInlinePass(), PMW);
+ addModulePass(AlwaysInlinerPass(), PMW);
- addPass(AMDGPUExportKernelRuntimeHandlesPass());
+ addModulePass(AMDGPUExportKernelRuntimeHandlesPass(), PMW);
+
+ if (EnableLowerExecSync)
+ addModulePass(AMDGPULowerExecSyncPass(), PMW);
if (EnableSwLowerLDS)
- addPass(AMDGPUSwLowerLDSPass(TM));
+ addModulePass(AMDGPUSwLowerLDSPass(TM), PMW);
// Runs before PromoteAlloca so the latter can account for function uses
if (EnableLowerModuleLDS)
- addPass(AMDGPULowerModuleLDSPass(TM));
+ addModulePass(AMDGPULowerModuleLDSPass(TM), PMW);
// Run atomic optimizer before Atomic Expand
if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
(AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
- addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy));
+ addFunctionPass(
+ AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy), PMW);
- addPass(AtomicExpandPass(&TM));
+ addFunctionPass(AtomicExpandPass(TM), PMW);
if (TM.getOptLevel() > CodeGenOptLevel::None) {
- addPass(AMDGPUPromoteAllocaPass(TM));
+ addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW);
if (isPassEnabled(EnableScalarIRPasses))
- addStraightLineScalarOptimizationPasses(addPass);
+ addStraightLineScalarOptimizationPasses(PMW);
// TODO: Handle EnableAMDGPUAliasAnalysis
// TODO: May want to move later or split into an early and late one.
- addPass(AMDGPUCodeGenPreparePass(TM));
+ addFunctionPass(AMDGPUCodeGenPreparePass(TM), PMW);
// Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
// have expanded.
if (TM.getOptLevel() > CodeGenOptLevel::Less) {
- addPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()),
- /*UseMemorySSA=*/true));
+ addFunctionPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()),
+ /*UseMemorySSA=*/true),
+ PMW);
}
}
- Base::addIRPasses(addPass);
+ Base::addIRPasses(PMW);
// EarlyCSE is not always strong enough to clean up what LSR produces. For
// example, GVN can combine
@@ -2117,20 +2248,23 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
//
// but EarlyCSE can do neither of them.
if (isPassEnabled(EnableScalarIRPasses))
- addEarlyCSEOrGVNPass(addPass);
+ addEarlyCSEOrGVNPass(PMW);
}
-void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
- if (TM.getOptLevel() > CodeGenOptLevel::None)
- addPass(AMDGPUPreloadKernelArgumentsPass(TM));
+void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(
+ PassManagerWrapper &PMW) const {
+ if (TM.getOptLevel() > CodeGenOptLevel::None) {
+ flushFPMsToMPM(PMW);
+ addModulePass(AMDGPUPreloadKernelArgumentsPass(TM), PMW);
+ }
if (EnableLowerKernelArguments)
- addPass(AMDGPULowerKernelArgumentsPass(TM));
+ addFunctionPass(AMDGPULowerKernelArgumentsPass(TM), PMW);
- Base::addCodeGenPrepare(addPass);
+ Base::addCodeGenPrepare(PMW);
if (isPassEnabled(EnableLoadStoreVectorizer))
- addPass(LoadStoreVectorizerPass());
+ addFunctionPass(LoadStoreVectorizerPass(), PMW);
// This lowering has been placed after codegenprepare to take advantage of
// address mode matching (which is why it isn't put with the LDS lowerings).
@@ -2139,102 +2273,160 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
// but has been put before switch lowering and CFG flattening so that those
// passes can run on the more optimized control flow this pass creates in
// many cases.
- addPass(AMDGPULowerBufferFatPointersPass(TM));
- addPass.requireCGSCCOrder();
+ flushFPMsToMPM(PMW);
+ addModulePass(AMDGPULowerBufferFatPointersPass(TM), PMW);
+ flushFPMsToMPM(PMW);
+ requireCGSCCOrder(PMW);
- addPass(AMDGPULowerIntrinsicsPass(TM));
+ addModulePass(AMDGPULowerIntrinsicsPass(TM), PMW);
// LowerSwitch pass may introduce unreachable blocks that can cause unexpected
// behavior for subsequent passes. Placing it here seems better; these
// blocks will get cleaned up by UnreachableBlockElim, inserted next in the
// pass flow.
- addPass(LowerSwitchPass());
+ addFunctionPass(LowerSwitchPass(), PMW);
}
-void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
+void AMDGPUCodeGenPassBuilder::addPreISel(PassManagerWrapper &PMW) const {
+
+ // Require AMDGPUArgumentUsageAnalysis so that it's available during ISel.
+ flushFPMsToMPM(PMW);
+ addModulePass(RequireAnalysisPass<AMDGPUArgumentUsageAnalysis, Module>(),
+ PMW);
if (TM.getOptLevel() > CodeGenOptLevel::None) {
- addPass(FlattenCFGPass());
- addPass(SinkingPass());
- addPass(AMDGPULateCodeGenPreparePass(TM));
+ addFunctionPass(FlattenCFGPass(), PMW);
+ addFunctionPass(SinkingPass(), PMW);
+ addFunctionPass(AMDGPULateCodeGenPreparePass(TM), PMW);
}
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
- addPass(AMDGPUUnifyDivergentExitNodesPass());
- addPass(FixIrreduciblePass());
- addPass(UnifyLoopExitsPass());
- addPass(StructurizeCFGPass(/*SkipUniformRegions=*/false));
+ addFunctionPass(AMDGPUUnifyDivergentExitNodesPass(), PMW);
+ addFunctionPass(FixIrreduciblePass(), PMW);
+ addFunctionPass(UnifyLoopExitsPass(), PMW);
+ addFunctionPass(StructurizeCFGPass(/*SkipUniformRegions=*/false), PMW);
- addPass(AMDGPUAnnotateUniformValuesPass());
+ addFunctionPass(AMDGPUAnnotateUniformValuesPass(), PMW);
- addPass(SIAnnotateControlFlowPass(TM));
+ addFunctionPass(SIAnnotateControlFlowPass(TM), PMW);
// TODO: Move this right after structurizeCFG to avoid extra divergence
// analysis. This depends on stopping SIAnnotateControlFlow from making
// control flow modifications.
- addPass(AMDGPURewriteUndefForPHIPass());
+ addFunctionPass(AMDGPURewriteUndefForPHIPass(), PMW);
if (!getCGPassBuilderOption().EnableGlobalISelOption ||
!isGlobalISelAbortEnabled() || !NewRegBankSelect)
- addPass(LCSSAPass());
+ addFunctionPass(LCSSAPass(), PMW);
- if (TM.getOptLevel() > CodeGenOptLevel::Less)
- addPass(AMDGPUPerfHintAnalysisPass(TM));
+ if (TM.getOptLevel() > CodeGenOptLevel::Less) {
+ flushFPMsToMPM(PMW);
+ addModulePass(AMDGPUPerfHintAnalysisPass(TM), PMW);
+ }
// FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why
// isn't this in addInstSelector?
- addPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>(),
- /*Force=*/true);
+ addFunctionPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>(), PMW,
+ /*Force=*/true);
}
-void AMDGPUCodeGenPassBuilder::addILPOpts(AddMachinePass &addPass) const {
+void AMDGPUCodeGenPassBuilder::addILPOpts(PassManagerWrapper &PMW) const {
if (EnableEarlyIfConversion)
- addPass(EarlyIfConverterPass());
+ addMachineFunctionPass(EarlyIfConverterPass(), PMW);
- Base::addILPOpts(addPass);
+ Base::addILPOpts(PMW);
}
-void AMDGPUCodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass,
+void AMDGPUCodeGenPassBuilder::addAsmPrinter(PassManagerWrapper &PMW,
CreateMCStreamer) const {
// TODO: Add AsmPrinter.
}
-Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const {
- addPass(AMDGPUISelDAGToDAGPass(TM));
- addPass(SIFixSGPRCopiesPass());
- addPass(SILowerI1CopiesPass());
+Error AMDGPUCodeGenPassBuilder::addInstSelector(PassManagerWrapper &PMW) const {
+ addMachineFunctionPass(AMDGPUISelDAGToDAGPass(TM), PMW);
+ addMachineFunctionPass(SIFixSGPRCopiesPass(), PMW);
+ addMachineFunctionPass(SILowerI1CopiesPass(), PMW);
return Error::success();
}
-void AMDGPUCodeGenPassBuilder::addPreRewrite(AddMachinePass &addPass) const {
+void AMDGPUCodeGenPassBuilder::addPreRewrite(PassManagerWrapper &PMW) const {
if (EnableRegReassign) {
- addPass(GCNNSAReassignPass());
+ addMachineFunctionPass(GCNNSAReassignPass(), PMW);
}
+
+ addMachineFunctionPass(AMDGPURewriteAGPRCopyMFMAPass(), PMW);
}
void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
- AddMachinePass &addPass) const {
- Base::addMachineSSAOptimization(addPass);
+ PassManagerWrapper &PMW) const {
+ Base::addMachineSSAOptimization(PMW);
- addPass(SIFoldOperandsPass());
+ addMachineFunctionPass(SIFoldOperandsPass(), PMW);
if (EnableDPPCombine) {
- addPass(GCNDPPCombinePass());
+ addMachineFunctionPass(GCNDPPCombinePass(), PMW);
}
- addPass(SILoadStoreOptimizerPass());
+ addMachineFunctionPass(SILoadStoreOptimizerPass(), PMW);
if (isPassEnabled(EnableSDWAPeephole)) {
- addPass(SIPeepholeSDWAPass());
- addPass(EarlyMachineLICMPass());
- addPass(MachineCSEPass());
- addPass(SIFoldOperandsPass());
+ addMachineFunctionPass(SIPeepholeSDWAPass(), PMW);
+ addMachineFunctionPass(EarlyMachineLICMPass(), PMW);
+ addMachineFunctionPass(MachineCSEPass(), PMW);
+ addMachineFunctionPass(SIFoldOperandsPass(), PMW);
}
- addPass(DeadMachineInstructionElimPass());
- addPass(SIShrinkInstructionsPass());
+ addMachineFunctionPass(DeadMachineInstructionElimPass(), PMW);
+ addMachineFunctionPass(SIShrinkInstructionsPass(), PMW);
+}
+
+Error AMDGPUCodeGenPassBuilder::addFastRegAlloc(PassManagerWrapper &PMW) const {
+ insertPass<PHIEliminationPass>(SILowerControlFlowPass());
+
+ insertPass<TwoAddressInstructionPass>(SIWholeQuadModePass());
+
+ return Base::addFastRegAlloc(PMW);
}
-void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
- AddMachinePass &addPass) const {
+Error AMDGPUCodeGenPassBuilder::addRegAssignmentFast(
+ PassManagerWrapper &PMW) const {
+ if (auto Err = validateRegAllocOptions())
+ return Err;
+
+ addMachineFunctionPass(GCNPreRALongBranchRegPass(), PMW);
+
+ // SGPR allocation - default to fast at -O0.
+ if (SGPRRegAllocNPM == RegAllocType::Greedy)
+ addMachineFunctionPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW);
+ else
+ addMachineFunctionPass(RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}),
+ PMW);
+
+ // Equivalent of PEI for SGPRs.
+ addMachineFunctionPass(SILowerSGPRSpillsPass(), PMW);
+
+  // Pre-allocate WWM registers used in whole quad mode operations (for shaders).
+ addMachineFunctionPass(SIPreAllocateWWMRegsPass(), PMW);
+
+ // WWM allocation - default to fast at -O0.
+ if (WWMRegAllocNPM == RegAllocType::Greedy)
+ addMachineFunctionPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW);
+ else
+ addMachineFunctionPass(
+ RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), PMW);
+
+ addMachineFunctionPass(SILowerWWMCopiesPass(), PMW);
+ addMachineFunctionPass(AMDGPUReserveWWMRegsPass(), PMW);
+
+ // VGPR allocation - default to fast at -O0.
+ if (VGPRRegAllocNPM == RegAllocType::Greedy)
+ addMachineFunctionPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW);
+ else
+ addMachineFunctionPass(RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW);
+
+ return Error::success();
+}
+
+Error AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
+ PassManagerWrapper &PMW) const {
if (EnableDCEInRA)
insertPass<DetectDeadLanesPass>(DeadMachineInstructionElimPass());
@@ -2269,90 +2461,108 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
if (TM.getOptLevel() > CodeGenOptLevel::Less)
insertPass<MachineSchedulerPass>(SIFormMemoryClausesPass());
- Base::addOptimizedRegAlloc(addPass);
+ return Base::addOptimizedRegAlloc(PMW);
}
-void AMDGPUCodeGenPassBuilder::addPreRegAlloc(AddMachinePass &addPass) const {
+void AMDGPUCodeGenPassBuilder::addPreRegAlloc(PassManagerWrapper &PMW) const {
if (getOptLevel() != CodeGenOptLevel::None)
- addPass(AMDGPUPrepareAGPRAllocPass());
+ addMachineFunctionPass(AMDGPUPrepareAGPRAllocPass(), PMW);
}
Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
- AddMachinePass &addPass) const {
- // TODO: Check --regalloc-npm option
+ PassManagerWrapper &PMW) const {
+ if (auto Err = validateRegAllocOptions())
+ return Err;
- addPass(GCNPreRALongBranchRegPass());
+ addMachineFunctionPass(GCNPreRALongBranchRegPass(), PMW);
- addPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}));
+ // SGPR allocation - default to greedy at -O1 and above.
+ if (SGPRRegAllocNPM == RegAllocType::Fast)
+ addMachineFunctionPass(RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}),
+ PMW);
+ else
+ addMachineFunctionPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW);
// Commit allocated register changes. This is mostly necessary because too
// many things rely on the use lists of the physical registers, such as the
// verifier. This is only necessary with allocators which use LiveIntervals,
// since FastRegAlloc does the replacements itself.
- addPass(VirtRegRewriterPass(false));
+ addMachineFunctionPass(VirtRegRewriterPass(false), PMW);
// At this point, the sgpr-regalloc has been done and it is good to have the
// stack slot coloring to try to optimize the SGPR spill stack indices before
// attempting the custom SGPR spill lowering.
- addPass(StackSlotColoringPass());
+ addMachineFunctionPass(StackSlotColoringPass(), PMW);
// Equivalent of PEI for SGPRs.
- addPass(SILowerSGPRSpillsPass());
+ addMachineFunctionPass(SILowerSGPRSpillsPass(), PMW);
// Pre-allocate WWM registers used in whole quad mode operations (for shaders).
- addPass(SIPreAllocateWWMRegsPass());
-
- // For allocating other wwm register operands.
- addPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}));
- addPass(SILowerWWMCopiesPass());
- addPass(VirtRegRewriterPass(false));
- addPass(AMDGPUReserveWWMRegsPass());
-
- // For allocating per-thread VGPRs.
- addPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}));
+ addMachineFunctionPass(SIPreAllocateWWMRegsPass(), PMW);
+ // WWM allocation - default to greedy at -O1 and above.
+ if (WWMRegAllocNPM == RegAllocType::Fast)
+ addMachineFunctionPass(
+ RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), PMW);
+ else
+ addMachineFunctionPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW);
+ addMachineFunctionPass(SILowerWWMCopiesPass(), PMW);
+ addMachineFunctionPass(VirtRegRewriterPass(false), PMW);
+ addMachineFunctionPass(AMDGPUReserveWWMRegsPass(), PMW);
+
+ // VGPR allocation - default to greedy at -O1 and above.
+ if (VGPRRegAllocNPM == RegAllocType::Fast)
+ addMachineFunctionPass(RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW);
+ else
+ addMachineFunctionPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW);
- addPreRewrite(addPass);
- addPass(VirtRegRewriterPass(true));
+ addPreRewrite(PMW);
+ addMachineFunctionPass(VirtRegRewriterPass(true), PMW);
- addPass(AMDGPUMarkLastScratchLoadPass());
+ addMachineFunctionPass(AMDGPUMarkLastScratchLoadPass(), PMW);
return Error::success();
}
-void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const {
- addPass(SIFixVGPRCopiesPass());
+void AMDGPUCodeGenPassBuilder::addPostRegAlloc(PassManagerWrapper &PMW) const {
+ addMachineFunctionPass(SIFixVGPRCopiesPass(), PMW);
if (TM.getOptLevel() > CodeGenOptLevel::None)
- addPass(SIOptimizeExecMaskingPass());
- Base::addPostRegAlloc(addPass);
+ addMachineFunctionPass(SIOptimizeExecMaskingPass(), PMW);
+ Base::addPostRegAlloc(PMW);
}
-void AMDGPUCodeGenPassBuilder::addPreSched2(AddMachinePass &addPass) const {
+void AMDGPUCodeGenPassBuilder::addPreSched2(PassManagerWrapper &PMW) const {
if (TM.getOptLevel() > CodeGenOptLevel::None)
- addPass(SIShrinkInstructionsPass());
- addPass(SIPostRABundlerPass());
+ addMachineFunctionPass(SIShrinkInstructionsPass(), PMW);
+ addMachineFunctionPass(SIPostRABundlerPass(), PMW);
}
-void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
+void AMDGPUCodeGenPassBuilder::addPostBBSections(
+ PassManagerWrapper &PMW) const {
+ // We run this later to avoid passes like livedebugvalues and BBSections
+ // having to deal with the apparent multi-entry functions we may generate.
+ addMachineFunctionPass(AMDGPUPreloadKernArgPrologPass(), PMW);
+}
+
+void AMDGPUCodeGenPassBuilder::addPreEmitPass(PassManagerWrapper &PMW) const {
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) {
- addPass(GCNCreateVOPDPass());
+ addMachineFunctionPass(GCNCreateVOPDPass(), PMW);
}
- addPass(SIMemoryLegalizerPass());
- addPass(SIInsertWaitcntsPass());
+ addMachineFunctionPass(SIMemoryLegalizerPass(), PMW);
+ addMachineFunctionPass(SIInsertWaitcntsPass(), PMW);
- // TODO: addPass(SIModeRegisterPass());
+ addMachineFunctionPass(SIModeRegisterPass(), PMW);
- if (TM.getOptLevel() > CodeGenOptLevel::None) {
- // TODO: addPass(SIInsertHardClausesPass());
- }
+ if (TM.getOptLevel() > CodeGenOptLevel::None)
+ addMachineFunctionPass(SIInsertHardClausesPass(), PMW);
- addPass(SILateBranchLoweringPass());
+ addMachineFunctionPass(SILateBranchLoweringPass(), PMW);
if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
- addPass(AMDGPUSetWavePriorityPass());
+ addMachineFunctionPass(AMDGPUSetWavePriorityPass(), PMW);
if (TM.getOptLevel() > CodeGenOptLevel::None)
- addPass(SIPreEmitPeepholePass());
+ addMachineFunctionPass(SIPreEmitPeepholePass(), PMW);
// The hazard recognizer that runs as part of the post-ra scheduler does not
// guarantee to be able to handle all hazards correctly. This is because if there
@@ -2362,15 +2572,15 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
//
// Here we add a stand-alone hazard recognizer pass which can handle all
// cases.
- addPass(PostRAHazardRecognizerPass());
- addPass(AMDGPUWaitSGPRHazardsPass());
- addPass(AMDGPULowerVGPREncodingPass());
+ addMachineFunctionPass(PostRAHazardRecognizerPass(), PMW);
+ addMachineFunctionPass(AMDGPUWaitSGPRHazardsPass(), PMW);
+ addMachineFunctionPass(AMDGPULowerVGPREncodingPass(), PMW);
if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) {
- addPass(AMDGPUInsertDelayAluPass());
+ addMachineFunctionPass(AMDGPUInsertDelayAluPass(), PMW);
}
- addPass(BranchRelaxationPass());
+ addMachineFunctionPass(BranchRelaxationPass(), PMW);
}
bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
@@ -2382,32 +2592,33 @@ bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
return Opt;
}
-void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(AddIRPass &addPass) const {
+void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(
+ PassManagerWrapper &PMW) const {
if (TM.getOptLevel() == CodeGenOptLevel::Aggressive)
- addPass(GVNPass());
+ addFunctionPass(GVNPass(), PMW);
else
- addPass(EarlyCSEPass());
+ addFunctionPass(EarlyCSEPass(), PMW);
}
void AMDGPUCodeGenPassBuilder::addStraightLineScalarOptimizationPasses(
- AddIRPass &addPass) const {
+ PassManagerWrapper &PMW) const {
if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
- addPass(LoopDataPrefetchPass());
+ addFunctionPass(LoopDataPrefetchPass(), PMW);
- addPass(SeparateConstOffsetFromGEPPass());
+ addFunctionPass(SeparateConstOffsetFromGEPPass(), PMW);
// ReassociateGEPs exposes more opportunities for SLSR. See
// the example in reassociate-geps-and-slsr.ll.
- addPass(StraightLineStrengthReducePass());
+ addFunctionPass(StraightLineStrengthReducePass(), PMW);
// SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
// EarlyCSE can reuse.
- addEarlyCSEOrGVNPass(addPass);
+ addEarlyCSEOrGVNPass(PMW);
// Run NaryReassociate after EarlyCSE/GVN to be more effective.
- addPass(NaryReassociatePass());
+ addFunctionPass(NaryReassociatePass(), PMW);
// NaryReassociate on GEPs creates redundant common expressions, so run
// EarlyCSE after it.
- addPass(EarlyCSEPass());
+ addFunctionPass(EarlyCSEPass(), PMW);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 03d16fd..d4a6838 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -80,7 +80,8 @@ static cl::opt<size_t> InlineMaxBB(
static cl::opt<unsigned> MemcpyLoopUnroll(
"amdgpu-memcpy-loop-unroll",
cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
- "operations when lowering memcpy as a loop"),
+ "operations when lowering statically-sized memcpy, memmove, or"
+ "memset as a loop"),
cl::init(16), cl::Hidden);
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
@@ -206,9 +207,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences(
dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
if (!Alloca || !Alloca->isStaticAlloca())
continue;
- Type *Ty = Alloca->getAllocatedType();
- unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
- if (AllocaSize > MaxAlloca)
+ auto AllocaSize = Alloca->getAllocationSize(DL);
+ if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
continue;
} else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
AS == AMDGPUAS::REGION_ADDRESS) {
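For reference, a self-contained sketch of the API semantics this hunk relies on (the isScalable() guard is an extra precaution, not part of the patch):

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    static bool fitsInBudget(const AllocaInst &AI, const DataLayout &DL,
                             uint64_t MaxAlloca) {
      // getAllocationSize returns std::nullopt when the size is unknown,
      // and the TypeSize may be scalable for vscale'd element types.
      std::optional<TypeSize> Size = AI.getAllocationSize(DL);
      return Size && !Size->isScalable() && Size->getFixedValue() <= MaxAlloca;
    }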
@@ -285,7 +285,7 @@ uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
// Codegen control options which don't matter.
AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
- AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
+ AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
AMDGPU::FeatureUnalignedAccessMode,
@@ -300,7 +300,7 @@ const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
AMDGPU::FeatureSRAMECC,
// Perf-tuning features
- AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
+ AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};
GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
: BaseT(TM, F.getDataLayout()),
@@ -804,7 +804,7 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
InstRate = getFullRateInstrCost();
static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
- if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
+ if (any_of(ValidSatTys, equal_to(LT.second)))
NElts = 1;
break;
}
@@ -883,10 +883,9 @@ GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
return LT.first * getHalfRateInstrCost(CostKind);
}
-InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
- const Value *Op1) const {
+InstructionCost GCNTTIImpl::getVectorInstrCost(
+ unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
+ const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
switch (Opcode) {
case Instruction::ExtractElement:
case Instruction::InsertElement: {
@@ -895,8 +894,8 @@ InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
if (EltSize < 32) {
if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
return 0;
- return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
- Op1);
+ return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
+ VIC);
}
// Extracts are just reads of a subregister, so are free. Inserts are
@@ -907,7 +906,8 @@ InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
return Index == ~0u ? 2 : 0;
}
default:
- return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
+ return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
+ VIC);
}
}
@@ -1150,41 +1150,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
return NewVal;
}
- case Intrinsic::ptrmask: {
- unsigned OldAS = OldV->getType()->getPointerAddressSpace();
- unsigned NewAS = NewV->getType()->getPointerAddressSpace();
- Value *MaskOp = II->getArgOperand(1);
- Type *MaskTy = MaskOp->getType();
-
- bool DoTruncate = false;
-
- const GCNTargetMachine &TM =
- static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
- if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
- // All valid 64-bit to 32-bit casts work by chopping off the high
- // bits. Any masking only clearing the low bits will also apply in the new
- // address space.
- if (DL.getPointerSizeInBits(OldAS) != 64 ||
- DL.getPointerSizeInBits(NewAS) != 32)
- return nullptr;
-
- // TODO: Do we need to thread more context in here?
- KnownBits Known = computeKnownBits(MaskOp, DL, nullptr, II);
- if (Known.countMinLeadingOnes() < 32)
- return nullptr;
-
- DoTruncate = true;
- }
-
- IRBuilder<> B(II);
- if (DoTruncate) {
- MaskTy = B.getInt32Ty();
- MaskOp = B.CreateTrunc(MaskOp, MaskTy);
- }
-
- return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
- {NewV, MaskOp});
- }
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin_num: {
Type *DestTy = II->getType();
@@ -1241,46 +1206,123 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
(ScalarSize == 16 || ScalarSize == 8)) {
// Larger vector widths may require additional instructions, but are
// typically cheaper than scalarized versions.
- unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
- unsigned RequestedElts =
- count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
+ //
+ // We assume that shuffling at a register granularity can be done for free.
+ // This is not true for vectors fed into memory instructions, but it is
+ // effectively true for all other shuffling. The emphasis of the logic here
+ // is to assist generic transform in cleaning up / canonicalizing those
+ // shuffles.
+
+ // With op_sel VOP3P instructions freely can access the low half or high
+ // half of a register, so any swizzle of two elements is free.
+ if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) {
+ unsigned NumSrcElts = SrcVecTy->getNumElements();
+ if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
+ (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
+ Kind == TTI::SK_PermuteSingleSrc))
+ return 0;
+ }
+
unsigned EltsPerReg = 32 / ScalarSize;
- if (RequestedElts == 0)
- return 0;
switch (Kind) {
case TTI::SK_Broadcast:
+ // A single v_perm_b32 can be re-used for all destination registers.
+ return 1;
case TTI::SK_Reverse:
- case TTI::SK_PermuteSingleSrc: {
- // With op_sel VOP3P instructions freely can access the low half or high
- // half of a register, so any swizzle of two elements is free.
- if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
- return 0;
- unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
- // SK_Broadcast just reuses the same mask
- unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
- return NumPerms + NumPermMasks;
- }
+ // One instruction per register.
+ if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
+ return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
+ return InstructionCost::getInvalid();
case TTI::SK_ExtractSubvector:
+ if (Index % EltsPerReg == 0)
+ return 0; // Shuffling at register granularity
+ if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
+ return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
+ return InstructionCost::getInvalid();
case TTI::SK_InsertSubvector: {
- // Even aligned accesses are free
- if (!(Index % 2))
- return 0;
- // Insert/extract subvectors only require shifts / extract code to get the
- // relevant bits
- return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
+ auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
+ if (!DstVecTy)
+ return InstructionCost::getInvalid();
+ unsigned NumDstElts = DstVecTy->getNumElements();
+ unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();
+ unsigned EndIndex = Index + NumInsertElts;
+ unsigned BeginSubIdx = Index % EltsPerReg;
+ unsigned EndSubIdx = EndIndex % EltsPerReg;
+ unsigned Cost = 0;
+
+ if (BeginSubIdx != 0) {
+ // Need to shift the inserted vector into place. The cost is the number
+ // of destination registers overlapped by the inserted vector.
+ Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);
+ }
+
+ // If the last register overlap is partial, there may be three source
+ // registers feeding into it; that takes an extra instruction.
+ if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
+ Cost += 1;
+
+ return Cost;
}
- case TTI::SK_PermuteTwoSrc:
- case TTI::SK_Splice:
- case TTI::SK_Select: {
- unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
- // SK_Select just reuses the same mask
- unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
- return NumPerms + NumPermMasks;
+ case TTI::SK_Splice: {
+ auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
+ if (!DstVecTy)
+ return InstructionCost::getInvalid();
+ unsigned NumElts = DstVecTy->getNumElements();
+ assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
+ // Determine the sub-region of the result vector that requires
+ // sub-register shuffles / mixing.
+ unsigned EltsFromLHS = NumElts - Index;
+ bool LHSIsAligned = (Index % EltsPerReg) == 0;
+ bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
+ if (LHSIsAligned && RHSIsAligned)
+ return 0;
+ if (LHSIsAligned && !RHSIsAligned)
+ return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
+ if (!LHSIsAligned && RHSIsAligned)
+ return divideCeil(EltsFromLHS, EltsPerReg);
+ return divideCeil(NumElts, EltsPerReg);
}
-
default:
break;
}
+
+ if (!Mask.empty()) {
+ unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
+
+ // Generically estimate the cost by assuming that each destination
+ // register is derived from sources via v_perm_b32 instructions if it
+ // can't be copied as-is.
+ //
+ // For each destination register, derive the cost of obtaining it based
+ // on the number of source registers that feed into it.
+ unsigned Cost = 0;
+ for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
+ SmallVector<int, 4> Regs;
+ bool Aligned = true;
+ for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
+ int SrcIdx = Mask[DstIdx + I];
+ if (SrcIdx == -1)
+ continue;
+ int Reg;
+ if (SrcIdx < (int)NumSrcElts) {
+ Reg = SrcIdx / EltsPerReg;
+ if (SrcIdx % EltsPerReg != I)
+ Aligned = false;
+ } else {
+ Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
+ if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
+ Aligned = false;
+ }
+ if (!llvm::is_contained(Regs, Reg))
+ Regs.push_back(Reg);
+ }
+ if (Regs.size() >= 2)
+ Cost += Regs.size() - 1;
+ else if (!Aligned)
+ Cost += 1;
+ }
+ return Cost;
+ }
}
return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
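A standalone sketch (not part of the patch; helper name hypothetical) of the generic mask-costing loop above, written against plain containers so the register-granularity model can be tested in isolation. Sources are treated as concatenated, -1 marks an undef lane, and EltsPerReg is how many elements share one 32-bit VGPR:

#include <algorithm>
#include <cstddef>
#include <vector>

// For each destination register, count the source registers feeding it:
// N sources need N-1 mixing instructions (v_perm_b32-style); a single
// source still needs one instruction if any lane moved within the register.
static unsigned estimateShuffleCost(const std::vector<int> &Mask,
                                    unsigned NumSrcElts, unsigned EltsPerReg) {
  unsigned Cost = 0;
  for (std::size_t DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
    std::vector<int> Regs;
    bool Aligned = true;
    for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
      int SrcIdx = Mask[DstIdx + I];
      if (SrcIdx == -1)
        continue; // Undef lanes are free.
      bool FromLHS = SrcIdx < (int)NumSrcElts;
      unsigned Local = FromLHS ? SrcIdx : SrcIdx - NumSrcElts;
      // Number the second operand's registers after the first operand's.
      int Reg = (FromLHS ? 0 : (int)NumSrcElts) + Local / EltsPerReg;
      if (Local % EltsPerReg != I)
        Aligned = false; // Lane moves within its register.
      if (std::find(Regs.begin(), Regs.end(), Reg) == Regs.end())
        Regs.push_back(Reg);
    }
    if (Regs.size() >= 2)
      Cost += Regs.size() - 1;
    else if (!Aligned)
      Cost += 1;
  }
  return Cost;
}

For example, reversing <4 x i16> with EltsPerReg == 2 (Mask = {3,2,1,0}) costs 2: each destination register is fed by exactly one source register, but its two lanes arrive swapped, so each needs one perm.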
@@ -1299,8 +1341,60 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
continue;
- if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
+ if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) {
Ops.push_back(&Op);
+ continue;
+ }
+
+ // Check for zero-cost, multiple-use InsertElement/ExtractElement
+ // instructions.
+ if (Instruction *OpInst = dyn_cast<Instruction>(Op.get())) {
+ if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) {
+ Instruction *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0));
+ if (VecOpInst && VecOpInst->hasOneUse())
+ continue;
+
+ if (getVectorInstrCost(OpInst->getOpcode(), OpInst->getType(),
+ TTI::TCK_RecipThroughput, 0,
+ OpInst->getOperand(0),
+ OpInst->getOperand(1)) == 0) {
+ Ops.push_back(&Op);
+ continue;
+ }
+ }
+ }
+
+ if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
+
+ unsigned EltSize = DL.getTypeSizeInBits(
+ cast<VectorType>(Shuffle->getType())->getElementType());
+
+ // Shuffles of i32 (or wider) elements will be lowered into a series of
+ // insert/extract elements, which will be coalesced away.
+ if (EltSize < 16 || !ST->has16BitInsts())
+ continue;
+
+ int NumSubElts, SubIndex;
+ if (Shuffle->changesLength()) {
+ if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
+ Ops.push_back(&Op);
+ continue;
+ }
+
+ if ((Shuffle->isExtractSubvectorMask(SubIndex) ||
+ Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) &&
+ !(SubIndex & 0x1)) {
+ Ops.push_back(&Op);
+ continue;
+ }
+ }
+
+ if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
+ Shuffle->isSingleSource()) {
+ Ops.push_back(&Op);
+ continue;
+ }
+ }
}
return !Ops.empty();
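For reference, the fabs/fneg test above relies on LLVM's PatternMatch combinators; a minimal sketch of the same idiom (helper name hypothetical):

#include "llvm/IR/PatternMatch.h"

// |x| and -x are free VALU source modifiers on AMDGPU, so sinking them next
// to their user lets instruction selection fold them away.
static bool isFreeSourceModifier(const llvm::Use &Op) {
  using namespace llvm::PatternMatch;
  return match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()));
}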
@@ -1413,7 +1507,8 @@ static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
continue;
- AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
+ if (auto Size = AI->getAllocationSize(DL))
+ AllocaSize += Size->getFixedValue();
}
return AllocaSize;
}
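The switch from DL.getTypeAllocSize(AI->getAllocatedType()) to AI->getAllocationSize(DL) is not just cosmetic: getAllocationSize folds in the alloca's array-size operand and returns std::nullopt when that size is not a compile-time constant. A sketch of the guarded accumulation pattern (helper name hypothetical):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include <cstdint>

// Sum the bytes of allocas whose size is statically known; dynamic allocas
// contribute nothing instead of being mis-counted as a single element.
static uint64_t sumKnownAllocaBytes(llvm::ArrayRef<const llvm::AllocaInst *> AIs,
                                    const llvm::DataLayout &DL) {
  uint64_t Bytes = 0;
  for (const llvm::AllocaInst *AI : AIs)
    if (auto Size = AI->getAllocationSize(DL))
      Bytes += Size->getFixedValue();
  return Bytes;
}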
@@ -1467,10 +1562,13 @@ unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
Threshold += Threshold / 2;
}
- auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
+ auto ArgAllocaSize = AI->getAllocationSize(DL);
+ if (!ArgAllocaSize)
+ return 0;
// Attribute the bonus proportionally to the alloca size
- unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;
+ unsigned AllocaThresholdBonus =
+ (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;
return AllocaThresholdBonus;
}
@@ -1574,3 +1672,14 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
}
return BaseT::getNumberOfParts(Tp);
}
+
+InstructionUniformity
+GCNTTIImpl::getInstructionUniformity(const Value *V) const {
+ if (isAlwaysUniform(V))
+ return InstructionUniformity::AlwaysUniform;
+
+ if (isSourceOfDivergence(V))
+ return InstructionUniformity::NeverUniform;
+
+ return InstructionUniformity::Default;
+}
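The new hook folds the two existing per-value queries into a single three-way answer. A hypothetical consumer, sketched only to show the intended contract:

#include "llvm/ADT/Uniformity.h"
#include "llvm/Support/ErrorHandling.h"

// Only values with no target-mandated answer need dataflow propagation;
// AlwaysUniform/NeverUniform act as fixed points seeded by the target.
static bool needsPropagation(llvm::InstructionUniformity IU) {
  switch (IU) {
  case llvm::InstructionUniformity::AlwaysUniform:
  case llvm::InstructionUniformity::NeverUniform:
    return false;
  case llvm::InstructionUniformity::Default:
    return true;
  }
  llvm_unreachable("covered switch");
}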
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 20da834..3ec157aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -101,6 +101,14 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;
+ /// \returns true if V might be divergent even when all of its operands
+ /// are uniform.
+ bool isSourceOfDivergence(const Value *V) const;
+
+ /// Returns true for the target specific set of operations which produce
+ /// uniform result even taking non-uniform arguments.
+ bool isAlwaysUniform(const Value *V) const;
+
public:
explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
@@ -168,14 +176,13 @@ public:
ArrayRef<unsigned> Indices = {}) const;
using BaseT::getVectorInstrCost;
- InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
- const Value *Op1) const override;
+ InstructionCost
+ getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind,
+ unsigned Index, const Value *Op0, const Value *Op1,
+ TTI::VectorInstrContext VIC =
+ TTI::VectorInstrContext::None) const override;
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
- bool isSourceOfDivergence(const Value *V) const override;
- bool isAlwaysUniform(const Value *V) const override;
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
// Address space casts must cast between different address spaces.
@@ -302,6 +309,8 @@ public:
/// together under a single i32 value. Otherwise fall back to base
/// implementation.
unsigned getNumberOfParts(Type *Tp) const override;
+
+ InstructionUniformity getInstructionUniformity(const Value *V) const override;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 50c78d8..864d877 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -16,12 +16,6 @@
/// uniformity. And every instruction that's downstream and cares about dynamic
/// uniformity must be convergent (and isel will introduce v_readfirstlane for
/// them if their operands can't be proven statically uniform).
-///
-/// This pass is implemented as a ModulePass because intrinsic declarations
-/// exist at the module scope, allowing us to skip processing entirely if no
-/// declarations are present and to traverse their user lists directly when
-/// they are. A FunctionPass would instead require scanning every instruction
-/// in every function to find relevant intrinsics, which is far less efficient.
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
@@ -63,10 +57,11 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
const UniformityInfo &UI,
ValueMap<const Value *, bool> &Tracker) {
llvm::Intrinsic::ID IID = II.getIntrinsicID();
-
+ // We deliberately do not simplify readfirstlane with a uniform argument, so
+ // that frontends can use it to force a copy to an SGPR and thereby prevent
+ // the backend from generating unwanted waterfall loops.
switch (IID) {
case Intrinsic::amdgcn_permlane64:
- case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
Value *Src = II.getArgOperand(0);
if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
@@ -97,14 +92,12 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
Tracker[NotOp] = true; // NOT preserves uniformity
LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
ICmp->replaceAllUsesWith(NotOp);
- ICmp->eraseFromParent();
Changed = true;
} else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
// Case: (icmp ne %ballot, 0) -> %ballot_arg
LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
<< *Src << '\n');
ICmp->replaceAllUsesWith(Src);
- ICmp->eraseFromParent();
Changed = true;
}
}
@@ -114,46 +107,95 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
II.eraseFromParent();
return Changed;
}
+ case Intrinsic::amdgcn_wave_shuffle: {
+ Use &Val = II.getOperandUse(0);
+ Use &Idx = II.getOperandUse(1);
+
+ // As with readlane, if the value is uniform, just propagate it.
+ if (!isDivergentUseWithNew(Val, UI, Tracker)) {
+ II.replaceAllUsesWith(Val);
+ II.eraseFromParent();
+ return true;
+ }
+
+ // Otherwise, if the index is uniform, this is just a readlane operation.
+ if (isDivergentUseWithNew(Idx, UI, Tracker))
+ return false;
+
+ // The readlane intrinsic we want to call has exactly the same function
+ // signature, so we can simply retarget the call in place.
+ Module *Mod = II.getModule();
+ II.setCalledFunction(Intrinsic::getOrInsertDeclaration(
+ Mod, Intrinsic::amdgcn_readlane, II.getType()));
+ return true;
+ }
default:
- llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic");
+ return false;
}
return false;
}
-/// Iterates over intrinsic declarations in the module to optimize their uses.
-static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
+/// Iterates over the intrinsic calls in the function and optimizes them.
+static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) {
bool IsChanged = false;
ValueMap<const Value *, bool> Tracker;
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- for (Function &F : M) {
- switch (F.getIntrinsicID()) {
- case Intrinsic::amdgcn_permlane64:
- case Intrinsic::amdgcn_readfirstlane:
- case Intrinsic::amdgcn_readlane:
- case Intrinsic::amdgcn_ballot:
- break;
- default:
+ for (Instruction &I : make_early_inc_range(instructions(F))) {
+ auto *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
continue;
- }
-
- for (User *U : make_early_inc_range(F.users())) {
- auto *II = cast<IntrinsicInst>(U);
- Function *ParentF = II->getFunction();
- const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
- IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
- }
+ IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
}
return IsChanged;
}
PreservedAnalyses
-AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) {
- if (!runUniformIntrinsicCombine(M, AM))
+AMDGPUUniformIntrinsicCombinePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ const auto &UI = AM.getResult<UniformityInfoAnalysis>(F);
+ if (!runUniformIntrinsicCombine(F, UI))
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<UniformityInfoAnalysis>();
return PA;
}
+
+namespace {
+class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass {
+public:
+ static char ID;
+ AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) {}
+
+private:
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<UniformityInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ }
+};
+} // namespace
+
+char AMDGPUUniformIntrinsicCombineLegacy::ID = 0;
+char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID =
+ AMDGPUUniformIntrinsicCombineLegacy::ID;
+
+bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+ const UniformityInfo &UI =
+ getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+ return runUniformIntrinsicCombine(F, UI);
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
+ "AMDGPU Uniform Intrinsic Combine", false, false)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
+ "AMDGPU Uniform Intrinsic Combine", false, false)
+
+FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() {
+ return new AMDGPUUniformIntrinsicCombineLegacy();
+}
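A sketch of the in-place retargeting trick the wave_shuffle case uses: when two intrinsics share a signature, swapping the callee preserves the call site and all its uses (this assumes the overload types really do match, as they do for wave_shuffle and readlane):

#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

// Replace the callee of an overloaded intrinsic call without rebuilding it.
static void retargetIntrinsic(llvm::IntrinsicInst &II,
                              llvm::Intrinsic::ID NewID) {
  llvm::Module *M = II.getModule();
  II.setCalledFunction(
      llvm::Intrinsic::getOrInsertDeclaration(M, NewID, II.getType()));
}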
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 733c5d5..fe81a5e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet(
return NewRetBlock;
}
+static BasicBlock *
+createDummyReturnBlock(Function &F,
+ SmallVector<BasicBlock *, 4> &ReturningBlocks) {
+ BasicBlock *DummyReturnBB =
+ BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F);
+ Type *RetTy = F.getReturnType();
+ Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
+ ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
+ ReturningBlocks.push_back(DummyReturnBB);
+ return DummyReturnBB;
+}
+
+/// Handle conditional branch instructions (-> 2 targets) and callbr
+/// instructions with N targets.
+static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI,
+ BasicBlock *DummyReturnBB,
+ std::vector<DominatorTree::UpdateType> &Updates) {
+ SmallVector<BasicBlock *, 2> Successors(successors(BB));
+
+ // Create a new transition block to hold the conditional branch.
+ BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
+
+ Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
+
+ // 'Successors' become successors of TransitionBB instead of BB,
+ // and TransitionBB becomes the single successor of BB.
+ Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
+ for (BasicBlock *Successor : Successors) {
+ Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
+ Updates.emplace_back(DominatorTree::Delete, BB, Successor);
+ }
+
+ // Create a branch that always branches to the transition block while
+ // keeping a reference to DummyReturnBB.
+ BB->getTerminator()->eraseFromParent();
+ BranchInst::Create(TransitionBB, DummyReturnBB,
+ ConstantInt::getTrue(F.getContext()), BB);
+ Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
+}
+
bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
const PostDominatorTree &PDT,
const UniformityInfo &UA) {
- assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
-
if (PDT.root_size() == 0 ||
(PDT.root_size() == 1 &&
- !isa<BranchInst>(PDT.getRoot()->getTerminator())))
+ !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator())))
return false;
// Loop over all of the blocks in a function, tracking all of the blocks that
@@ -222,46 +260,28 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
if (HasDivergentExitBlock)
UnreachableBlocks.push_back(BB);
} else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
-
- ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
- if (DummyReturnBB == nullptr) {
- DummyReturnBB = BasicBlock::Create(F.getContext(),
- "DummyReturnBlock", &F);
- Type *RetTy = F.getReturnType();
- Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
- ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
- ReturningBlocks.push_back(DummyReturnBB);
- }
+ if (!DummyReturnBB)
+ DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
if (BI->isUnconditional()) {
BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
BI->eraseFromParent(); // Delete the unconditional branch.
// Add a new conditional branch with a dummy edge to the return block.
- BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
- Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
- } else { // Conditional branch.
- SmallVector<BasicBlock *, 2> Successors(successors(BB));
-
- // Create a new transition block to hold the conditional branch.
- BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
-
- Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
-
- // 'Successors' become successors of TransitionBB instead of BB,
- // and TransitionBB becomes a single successor of BB.
- Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
- for (BasicBlock *Successor : Successors) {
- Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
- Updates.emplace_back(DominatorTree::Delete, BB, Successor);
- }
-
- // Create a branch that will always branch to the transition block and
- // references DummyReturnBB.
- BB->getTerminator()->eraseFromParent();
- BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
+ BranchInst::Create(LoopHeaderBB, DummyReturnBB,
+ ConstantInt::getTrue(F.getContext()), BB);
Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
+ } else {
+ handleNBranch(F, BB, BI, DummyReturnBB, Updates);
}
Changed = true;
+ } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) {
+ if (!DummyReturnBB)
+ DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
+
+ handleNBranch(F, BB, CBI, DummyReturnBB, Updates);
+ Changed = true;
+ } else {
+ llvm_unreachable("unsupported block terminator");
}
}
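Both branch rewrites above batch their CFG edge changes and hand them to the dominator tree at the end; a sketch of that incremental-update pattern:

#include "llvm/IR/Dominators.h"
#include <vector>

// Each update is (Insert|Delete, From, To). Applying them in one batch lets
// the tree recompute only the affected region instead of from scratch.
static void commitEdgeUpdates(
    llvm::DominatorTree *DT,
    const std::vector<llvm::DominatorTree::UpdateType> &Updates) {
  if (DT && !Updates.empty())
    DT->applyUpdates(Updates);
}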
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
index 61c5dcd..faef408 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
@@ -17,6 +17,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
@@ -44,6 +45,7 @@ namespace {
class AMDGPUWaitSGPRHazards {
public:
+ const GCNSubtarget *ST;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const MachineRegisterInfo *MRI;
@@ -54,7 +56,7 @@ public:
bool CullSGPRHazardsAtMemWait;
unsigned CullSGPRHazardsMemWaitThreshold;
- AMDGPUWaitSGPRHazards() {}
+ AMDGPUWaitSGPRHazards() = default;
// Return the numeric ID 0-127 for a given SGPR.
static std::optional<unsigned> sgprNumber(Register Reg,
@@ -165,7 +167,7 @@ public:
}
unsigned mergeMasks(unsigned Mask1, unsigned Mask2) {
- unsigned Mask = 0xffff;
+ unsigned Mask = AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST);
Mask = AMDGPU::DepCtr::encodeFieldSaSdst(
Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1),
AMDGPU::DepCtr::decodeFieldSaSdst(Mask2)));
@@ -181,9 +183,12 @@ public:
Mask = AMDGPU::DepCtr::encodeFieldVaVdst(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVdst(Mask1),
AMDGPU::DepCtr::decodeFieldVaVdst(Mask2)));
+ const AMDGPU::IsaVersion &Version = AMDGPU::getIsaVersion(ST->getCPU());
Mask = AMDGPU::DepCtr::encodeFieldHoldCnt(
- Mask, std::min(AMDGPU::DepCtr::decodeFieldHoldCnt(Mask1),
- AMDGPU::DepCtr::decodeFieldHoldCnt(Mask2)));
+ Mask,
+ std::min(AMDGPU::DepCtr::decodeFieldHoldCnt(Mask1, Version),
+ AMDGPU::DepCtr::decodeFieldHoldCnt(Mask2, Version)),
+ Version);
Mask = AMDGPU::DepCtr::encodeFieldVaSsrc(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSsrc(Mask1),
AMDGPU::DepCtr::decodeFieldVaSsrc(Mask2)));
@@ -387,7 +392,7 @@ public:
// Apply wait
if (Wait) {
- unsigned Mask = 0xffff;
+ unsigned Mask = AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST);
if (Wait & WA_VCC) {
State.VCCHazard &= ~HazardState::VALU;
Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0);
@@ -438,8 +443,8 @@ public:
}
bool run(MachineFunction &MF) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (!ST.hasVALUReadSGPRHazard())
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!ST->hasVALUReadSGPRHazard())
return false;
// Parse settings
@@ -467,10 +472,10 @@ public:
if (!EnableSGPRHazardWaits)
return false;
- TII = ST.getInstrInfo();
- TRI = ST.getRegisterInfo();
+ TII = ST->getInstrInfo();
+ TRI = ST->getRegisterInfo();
MRI = &MF.getRegInfo();
- DsNopCount = ST.isWave64() ? WAVE64_NOPS : WAVE32_NOPS;
+ DsNopCount = ST->isWave64() ? WAVE64_NOPS : WAVE32_NOPS;
auto CallingConv = MF.getFunction().getCallingConv();
if (!AMDGPU::isEntryFunctionCC(CallingConv) &&
@@ -555,6 +560,6 @@ PreservedAnalyses
AMDGPUWaitSGPRHazardsPass::run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM) {
if (AMDGPUWaitSGPRHazards().run(MF))
- return PreservedAnalyses::none();
+ return getMachineFunctionPassPreservedAnalyses();
return PreservedAnalyses::all();
}
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 99ba043..998a9d0 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -126,6 +126,7 @@ public:
ImmTySMEMOffsetMod,
ImmTyCPol,
ImmTyTFE,
+ ImmTyIsAsync,
ImmTyD16,
ImmTyClamp,
ImmTyOModSI,
@@ -143,10 +144,13 @@ public:
ImmTyExpTgt,
ImmTyExpCompr,
ImmTyExpVM,
+ ImmTyDone,
+ ImmTyRowEn,
ImmTyFORMAT,
ImmTyHwreg,
ImmTyOff,
ImmTySendMsg,
+ ImmTyWaitEvent,
ImmTyInterpSlot,
ImmTyInterpAttr,
ImmTyInterpAttrChan,
@@ -347,6 +351,11 @@ public:
return isRegKind() && getReg() == AMDGPU::SGPR_NULL;
}
+ bool isAV_LdSt_32_Align2_RegOp() const {
+ return isRegClass(AMDGPU::VGPR_32RegClassID) ||
+ isRegClass(AMDGPU::AGPR_32RegClassID);
+ }
+
bool isVRegWithInputMods() const;
template <bool IsFake16> bool isT16_Lo128VRegWithInputMods() const;
template <bool IsFake16> bool isT16VRegWithInputMods() const;
@@ -408,6 +417,8 @@ public:
bool isNegLo() const { return isImmTy(ImmTyNegLo); }
bool isNegHi() const { return isImmTy(ImmTyNegHi); }
bool isBitOp3() const { return isImmTy(ImmTyBitOp3) && isUInt<8>(getImm()); }
+ bool isDone() const { return isImmTy(ImmTyDone); }
+ bool isRowEn() const { return isImmTy(ImmTyRowEn); }
bool isRegOrImm() const {
return isReg() || isImm();
@@ -661,6 +672,8 @@ public:
bool isVSrc_v2f16() const { return isVSrc_f16() || isLiteralImm(MVT::v2f16); }
+ bool isVSrc_v2f16_splat() const { return isVSrc_v2f16(); }
+
bool isVSrc_NoInline_v2f16() const { return isVSrc_v2f16(); }
bool isVISrcB32() const {
@@ -956,6 +969,7 @@ public:
bool isSDelayALU() const;
bool isHwreg() const;
bool isSendMsg() const;
+ bool isWaitEvent() const;
bool isSplitBarrier() const;
bool isSwizzle() const;
bool isSMRDOffset8() const;
@@ -1108,6 +1122,7 @@ public:
case ImmTyIndexKey16bit: OS << "index_key"; break;
case ImmTyIndexKey32bit: OS << "index_key"; break;
case ImmTyTFE: OS << "TFE"; break;
+ case ImmTyIsAsync: OS << "IsAsync"; break;
case ImmTyD16: OS << "D16"; break;
case ImmTyFORMAT: OS << "FORMAT"; break;
case ImmTyClamp: OS << "Clamp"; break;
@@ -1133,8 +1148,11 @@ public:
case ImmTyExpTgt: OS << "ExpTgt"; break;
case ImmTyExpCompr: OS << "ExpCompr"; break;
case ImmTyExpVM: OS << "ExpVM"; break;
+ case ImmTyDone: OS << "Done"; break;
+ case ImmTyRowEn: OS << "RowEn"; break;
case ImmTyHwreg: OS << "Hwreg"; break;
case ImmTySendMsg: OS << "SendMsg"; break;
+ case ImmTyWaitEvent: OS << "WaitEvent"; break;
case ImmTyInterpSlot: OS << "InterpSlot"; break;
case ImmTyInterpAttr: OS << "InterpAttr"; break;
case ImmTyInterpAttrChan: OS << "InterpAttrChan"; break;
@@ -1544,6 +1562,12 @@ public:
bool isGFX1250() const { return AMDGPU::isGFX1250(getSTI()); }
+ bool isGFX1250Plus() const { return AMDGPU::isGFX1250Plus(getSTI()); }
+
+ bool isGFX13() const { return AMDGPU::isGFX13(getSTI()); }
+
+ bool isGFX13Plus() const { return AMDGPU::isGFX13Plus(getSTI()); }
+
bool isGFX10_AEncoding() const { return AMDGPU::isGFX10_AEncoding(getSTI()); }
bool isGFX10_BEncoding() const {
@@ -1675,7 +1699,8 @@ public:
ParseStatus
parseNamedBit(StringRef Name, OperandVector &Operands,
- AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
+ AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
+ bool IgnoreNegative = false);
unsigned getCPolKind(StringRef Id, StringRef Mnemo, bool &Disabling) const;
ParseStatus parseCPol(OperandVector &Operands);
ParseStatus parseScope(OperandVector &Operands, int64_t &Scope);
@@ -1763,7 +1788,7 @@ private:
bool IsSymbolic = false;
bool IsDefined = false;
- OperandInfoTy(int64_t Val) : Val(Val) {}
+ constexpr OperandInfoTy(int64_t Val) : Val(Val) {}
};
struct StructuredOpField : OperandInfoTy {
@@ -1772,8 +1797,8 @@ private:
unsigned Width;
bool IsDefined = false;
- StructuredOpField(StringLiteral Id, StringLiteral Desc, unsigned Width,
- int64_t Default)
+ constexpr StructuredOpField(StringLiteral Id, StringLiteral Desc,
+ unsigned Width, int64_t Default)
: OperandInfoTy(Default), Id(Id), Desc(Desc), Width(Width) {}
virtual ~StructuredOpField() = default;
@@ -1860,13 +1885,12 @@ private:
bool validateTHAndScopeBits(const MCInst &Inst, const OperandVector &Operands,
const unsigned CPol);
bool validateTFE(const MCInst &Inst, const OperandVector &Operands);
- bool validateSetVgprMSB(const MCInst &Inst, const OperandVector &Operands);
bool validateLdsDirect(const MCInst &Inst, const OperandVector &Operands);
bool validateWMMA(const MCInst &Inst, const OperandVector &Operands);
unsigned getConstantBusLimit(unsigned Opcode) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
- unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
+ MCRegister findImplicitSGPRReadInVOP(const MCInst &Inst) const;
bool isSupportedMnemo(StringRef Mnemo,
const FeatureBitset &FBS);
@@ -1905,6 +1929,7 @@ public:
ParseStatus parseExpTgt(OperandVector &Operands);
ParseStatus parseSendMsg(OperandVector &Operands);
+ ParseStatus parseWaitEvent(OperandVector &Operands);
ParseStatus parseInterpSlot(OperandVector &Operands);
ParseStatus parseInterpAttr(OperandVector &Operands);
ParseStatus parseSOPPBrTarget(OperandVector &Operands);
@@ -2040,6 +2065,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_KIMM16:
return &APFloat::IEEEhalf();
@@ -2434,6 +2460,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2FP32:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
@@ -2476,6 +2503,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
case AMDGPU::OPERAND_REG_IMM_V2FP32:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
@@ -2922,7 +2950,7 @@ MCRegister AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum,
return AMDGPU::NoRegister;
}
- if (RegKind == IS_VGPR && !isGFX1250() && RegIdx + RegWidth / 32 > 256) {
+ if (RegKind == IS_VGPR && !isGFX1250Plus() && RegIdx + RegWidth / 32 > 256) {
Error(Loc, "register index is out of range");
return MCRegister();
}
@@ -3666,7 +3694,8 @@ StringRef AMDGPUAsmParser::getMatchedVariantName() const {
return "";
}
-unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
+MCRegister
+AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
for (MCPhysReg Reg : Desc.implicit_uses()) {
switch (Reg) {
@@ -3680,7 +3709,7 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
break;
}
}
- return AMDGPU::NoRegister;
+ return MCRegister();
}
// NB: This code is correct only when used to check constant
@@ -3720,6 +3749,9 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16)
return AMDGPU::isInlinableLiteralV2F16(Val);
+ if (OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT)
+ return AMDGPU::isPKFMACF16InlineConstant(Val, isGFX11Plus());
+
if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2BF16 ||
OperandType == AMDGPU::OPERAND_REG_IMM_V2BF16)
return AMDGPU::isInlinableLiteralV2BF16(Val);
@@ -3855,9 +3887,9 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(
LiteralSize = 4;
}
- SmallDenseSet<unsigned> SGPRsUsed;
- unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst);
- if (SGPRUsed != AMDGPU::NoRegister) {
+ SmallDenseSet<MCRegister> SGPRsUsed;
+ MCRegister SGPRUsed = findImplicitSGPRReadInVOP(Inst);
+ if (SGPRUsed) {
SGPRsUsed.insert(SGPRUsed);
++ConstantBusUseCount;
}
@@ -3940,7 +3972,7 @@ AMDGPUAsmParser::checkVOPDRegBankConstraints(const MCInst &Inst, bool AsVOPD3) {
bool SkipSrc = Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 ||
Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 ||
Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250;
- bool AllowSameVGPR = isGFX1250();
+ bool AllowSameVGPR = isGFX1250Plus();
if (AsVOPD3) { // Literal constants are not allowed with VOPD3.
for (auto OpName : {OpName::src0X, OpName::src0Y}) {
@@ -4074,7 +4106,7 @@ bool AMDGPUAsmParser::tryVOPD(const MCInst &Inst) {
// form but switch to VOPD3 otherwise.
bool AMDGPUAsmParser::tryAnotherVOPDEncoding(const MCInst &Inst) {
const unsigned Opcode = Inst.getOpcode();
- if (!isGFX1250() || !isVOPD(Opcode))
+ if (!isGFX1250Plus() || !isVOPD(Opcode))
return false;
if (MII.get(Opcode).TSFlags & SIInstrFlags::VOPD3)
@@ -5364,7 +5396,7 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
unsigned CPol = Inst.getOperand(CPolPos).getImm();
- if (!isGFX1250()) {
+ if (!isGFX1250Plus()) {
if (CPol & CPol::SCAL) {
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
StringRef CStr(S.getPointer());
@@ -5506,22 +5538,6 @@ bool AMDGPUAsmParser::validateTFE(const MCInst &Inst,
return true;
}
-bool AMDGPUAsmParser::validateSetVgprMSB(const MCInst &Inst,
- const OperandVector &Operands) {
- if (Inst.getOpcode() != AMDGPU::S_SET_VGPR_MSB_gfx12)
- return true;
-
- int Simm16Pos =
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::simm16);
- if ((unsigned)Inst.getOperand(Simm16Pos).getImm() > 255) {
- SMLoc Loc = Operands[1]->getStartLoc();
- Error(Loc, "s_set_vgpr_msb accepts values in range [0..255]");
- return false;
- }
-
- return true;
-}
-
bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst,
const OperandVector &Operands) {
unsigned Opc = Inst.getOpcode();
@@ -5541,12 +5557,9 @@ bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst,
if (RegSize == AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt) * 32)
return true;
- static const char *FmtNames[] = {"MATRIX_FMT_FP8", "MATRIX_FMT_BF8",
- "MATRIX_FMT_FP6", "MATRIX_FMT_BF6",
- "MATRIX_FMT_FP4"};
-
Error(getOperandLoc(Operands, SrcIdx),
- "wrong register tuple size for " + Twine(FmtNames[Fmt]));
+ "wrong register tuple size for " +
+ Twine(WMMAMods::ModMatrixFmt[Fmt]));
return false;
};
@@ -5681,9 +5694,6 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, SMLoc IDLoc,
if (!validateTFE(Inst, Operands)) {
return false;
}
- if (!validateSetVgprMSB(Inst, Operands)) {
- return false;
- }
if (!validateWMMA(Inst, Operands)) {
return false;
}
@@ -6182,7 +6192,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
return Error(IDRange.Start, "directive requires gfx90a+", IDRange);
AccumOffset = ExprVal;
} else if (ID == ".amdhsa_named_barrier_count") {
- if (!isGFX1250())
+ if (!isGFX1250Plus())
return Error(IDRange.Start, "directive requires gfx1250+", IDRange);
NamedBarCnt = ExprVal;
} else if (ID == ".amdhsa_reserve_vcc") {
@@ -6382,7 +6392,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
return TokError("amdgpu_user_sgpr_count smaller than than implied by "
"enabled user SGPRs");
- if (isGFX1250()) {
+ if (isGFX1250Plus()) {
if (!isUInt<COMPUTE_PGM_RSRC2_GFX125_USER_SGPR_COUNT_WIDTH>(UserSGPRCount))
return TokError("too many user SGPRs enabled");
AMDGPU::MCKernelDescriptor::bits_set(
@@ -6437,7 +6447,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
getContext());
}
- if (isGFX1250())
+ if (isGFX1250Plus())
MCKernelDescriptor::bits_set(KD.compute_pgm_rsrc3, NamedBarCnt,
COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT,
COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
@@ -7046,13 +7056,16 @@ ParseStatus AMDGPUAsmParser::parseOperandArrayWithPrefix(
ParseStatus AMDGPUAsmParser::parseNamedBit(StringRef Name,
OperandVector &Operands,
- AMDGPUOperand::ImmTy ImmTy) {
+ AMDGPUOperand::ImmTy ImmTy,
+ bool IgnoreNegative) {
int64_t Bit;
SMLoc S = getLoc();
if (trySkipId(Name)) {
Bit = 1;
} else if (trySkipId("no", Name)) {
+ if (IgnoreNegative)
+ return ParseStatus::Success;
Bit = 0;
} else {
return ParseStatus::NoMatch;
@@ -7063,6 +7076,12 @@ ParseStatus AMDGPUAsmParser::parseNamedBit(StringRef Name,
if (Name == "a16" && !hasA16())
return Error(S, "a16 modifier is not supported on this GPU");
+ if (Bit == 0 && Name == "gds") {
+ StringRef Mnemo = ((AMDGPUOperand &)*Operands[0]).getToken();
+ if (Mnemo.starts_with("ds_gws"))
+ return Error(S, "nogds is not allowed");
+ }
+
if (isGFX9() && ImmTy == AMDGPUOperand::ImmTyA16)
ImmTy = AMDGPUOperand::ImmTyR128A16;
@@ -7403,10 +7422,7 @@ ParseStatus AMDGPUAsmParser::parseIndexKey32bit(OperandVector &Operands) {
ParseStatus AMDGPUAsmParser::tryParseMatrixFMT(OperandVector &Operands,
StringRef Name,
AMDGPUOperand::ImmTy Type) {
- return parseStringOrIntWithPrefix(Operands, Name,
- {"MATRIX_FMT_FP8", "MATRIX_FMT_BF8",
- "MATRIX_FMT_FP6", "MATRIX_FMT_BF6",
- "MATRIX_FMT_FP4"},
+ return parseStringOrIntWithPrefix(Operands, Name, WMMAMods::ModMatrixFmt,
Type);
}
@@ -7423,8 +7439,8 @@ ParseStatus AMDGPUAsmParser::parseMatrixBFMT(OperandVector &Operands) {
ParseStatus AMDGPUAsmParser::tryParseMatrixScale(OperandVector &Operands,
StringRef Name,
AMDGPUOperand::ImmTy Type) {
- return parseStringOrIntWithPrefix(
- Operands, Name, {"MATRIX_SCALE_ROW0", "MATRIX_SCALE_ROW1"}, Type);
+ return parseStringOrIntWithPrefix(Operands, Name, WMMAMods::ModMatrixScale,
+ Type);
}
ParseStatus AMDGPUAsmParser::parseMatrixAScale(OperandVector &Operands) {
@@ -7440,10 +7456,8 @@ ParseStatus AMDGPUAsmParser::parseMatrixBScale(OperandVector &Operands) {
ParseStatus AMDGPUAsmParser::tryParseMatrixScaleFmt(OperandVector &Operands,
StringRef Name,
AMDGPUOperand::ImmTy Type) {
- return parseStringOrIntWithPrefix(
- Operands, Name,
- {"MATRIX_SCALE_FMT_E8", "MATRIX_SCALE_FMT_E5M3", "MATRIX_SCALE_FMT_E4M3"},
- Type);
+ return parseStringOrIntWithPrefix(Operands, Name, WMMAMods::ModMatrixScaleFmt,
+ Type);
}
ParseStatus AMDGPUAsmParser::parseMatrixAScaleFmt(OperandVector &Operands) {
@@ -8241,6 +8255,41 @@ bool AMDGPUOperand::isSendMsg() const {
return isImmTy(ImmTySendMsg);
}
+ParseStatus AMDGPUAsmParser::parseWaitEvent(OperandVector &Operands) {
+ using namespace llvm::AMDGPU::WaitEvent;
+
+ SMLoc Loc = getLoc();
+ int64_t ImmVal = 0;
+
+ StructuredOpField DontWaitExportReady("dont_wait_export_ready", "bit value",
+ 1, 0);
+ StructuredOpField ExportReady("export_ready", "bit value", 1, 0);
+
+ StructuredOpField *TargetBitfield =
+ isGFX11() ? &DontWaitExportReady : &ExportReady;
+
+ ParseStatus Res = parseStructuredOpFields({TargetBitfield});
+ if (Res.isNoMatch() && parseExpr(ImmVal, "structured immediate"))
+ Res = ParseStatus::Success;
+ else if (Res.isSuccess()) {
+ if (!validateStructuredOpFields({TargetBitfield}))
+ return ParseStatus::Failure;
+ ImmVal = TargetBitfield->Val;
+ }
+
+ if (!Res.isSuccess())
+ return ParseStatus::Failure;
+
+ if (!isUInt<16>(ImmVal))
+ return Error(Loc, "invalid immediate: only 16-bit values are legal");
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc,
+ AMDGPUOperand::ImmTyWaitEvent));
+ return ParseStatus::Success;
+}
+
+bool AMDGPUOperand::isWaitEvent() const { return isImmTy(ImmTyWaitEvent); }
+
//===----------------------------------------------------------------------===//
// v_interp
//===----------------------------------------------------------------------===//
@@ -9048,6 +9097,9 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0);
+ // Add a dummy immediate operand as a placeholder for the SWZ operand. This
+ // keeps MCInst.getNumOperands consistent with MCInstrDesc.getNumOperands.
+ Inst.addOperand(MCOperand::createImm(0));
}
//===----------------------------------------------------------------------===//
@@ -9514,6 +9566,8 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
Opc == AMDGPU::V_CVT_SCALEF32_PK_FP4_BF16_vi ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_vi ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_gfx11 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_gfx11 ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_gfx12) {
Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
@@ -9523,7 +9577,19 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
// Adding vdst_in operand is already covered for these DPP instructions in
// cvtVOP3DPP.
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) &&
- !(Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp_gfx12 ||
+ !(Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp_gfx11 ||
+ Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp_gfx11 ||
+ Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp8_gfx11 ||
+ Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp8_gfx11 ||
+ Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp_gfx11 ||
+ Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp_gfx11 ||
+ Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp8_gfx11 ||
+ Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp8_gfx11 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx11 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx11 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx11 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx11 ||
+ Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp8_gfx12 ||
@@ -10439,7 +10505,7 @@ ParseStatus AMDGPUAsmParser::parseCustomOperand(OperandVector &Operands,
case MCK_addr64:
return parseTokenOp("addr64", Operands);
case MCK_done:
- return parseTokenOp("done", Operands);
+ return parseNamedBit("done", Operands, AMDGPUOperand::ImmTyDone, true);
case MCK_idxen:
return parseTokenOp("idxen", Operands);
case MCK_lds:
@@ -10449,7 +10515,7 @@ ParseStatus AMDGPUAsmParser::parseCustomOperand(OperandVector &Operands,
case MCK_off:
return parseTokenOp("off", Operands);
case MCK_row_95_en:
- return parseTokenOp("row_en", Operands);
+ return parseNamedBit("row_en", Operands, AMDGPUOperand::ImmTyRowEn, true);
case MCK_gds:
return parseNamedBit("gds", Operands, AMDGPUOperand::ImmTyGDS);
case MCK_tfe:
@@ -10480,6 +10546,10 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
return Operand.isOffen() ? Match_Success : Match_InvalidOperand;
case MCK_tfe:
return Operand.isTFE() ? Match_Success : Match_InvalidOperand;
+ case MCK_done:
+ return Operand.isDone() ? Match_Success : Match_InvalidOperand;
+ case MCK_row_95_en:
+ return Operand.isRowEn() ? Match_Success : Match_InvalidOperand;
case MCK_SSrc_b32:
// When operands have expression values, they will return true for isToken,
// because it is not possible to distinguish between a token and an
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index b97b738..568fff2 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -159,9 +159,9 @@ class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> :
}
class getMTBUFInsDA<list<RegisterOperand> vdataList,
- list<RegisterClass> vaddrList=[], bit hasRestrictedSOffset> {
+ list<RegisterOperand> vaddrList=[], bit hasRestrictedSOffset> {
RegisterOperand vdata_op = !if(!empty(vdataList), ?, !head(vdataList));
- RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ RegisterOperand vaddr_op = !if(!empty(vaddrList), ?, !head(vaddrList));
dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset),
(ins SCSrc_b32:$soffset));
@@ -171,7 +171,7 @@ class getMTBUFInsDA<list<RegisterOperand> vdataList,
dag Inputs = !if(!empty(vaddrList),
NonVaddrInputs,
- !con((ins vaddrClass:$vaddr), NonVaddrInputs));
+ !con((ins vaddr_op:$vaddr), NonVaddrInputs));
dag ret = !if(!empty(vdataList),
Inputs,
!con((ins vdata_op:$vdata), Inputs));
@@ -180,10 +180,10 @@ class getMTBUFInsDA<list<RegisterOperand> vdataList,
class getMTBUFIns<int addrKind, list<RegisterOperand> vdataList=[], bit hasRestrictedSOffset> {
dag ret =
!if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList, [], hasRestrictedSOffset>.ret,
- !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPR_32], hasRestrictedSOffset>.ret,
- !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPR_32], hasRestrictedSOffset>.ret,
- !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VReg_64], hasRestrictedSOffset>.ret,
- !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VReg_64], hasRestrictedSOffset>.ret,
+ !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPROp_32], hasRestrictedSOffset>.ret,
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPROp_32], hasRestrictedSOffset>.ret,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VGPROp_64], hasRestrictedSOffset>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VGPROp_64], hasRestrictedSOffset>.ret,
(ins))))));
}
@@ -393,7 +393,7 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> :
let sccb_value = 0;
}
-class getBUFVDataRegisterOperand<int Size, bit isTFE> {
+class getBUFVDataRegisterOperand<int Size, bit isTFE, bit isTrue16 = false> {
defvar tfeVDataOp =
!cond(!eq(Size, 16) : AVLdSt_64,
!eq(Size, 32) : AVLdSt_64,
@@ -402,7 +402,7 @@ class getBUFVDataRegisterOperand<int Size, bit isTFE> {
!eq(Size, 128) : AVLdSt_160);
defvar VDataOp =
- !cond(!eq(Size, 16) : AVLdSt_32,
+ !cond(!eq(Size, 16) : !if(isTrue16, VGPROp_16, AVLdSt_32),
!eq(Size, 32) : AVLdSt_32,
!eq(Size, 64) : AVLdSt_64,
!eq(Size, 96) : AVLdSt_96,
@@ -417,15 +417,17 @@ class getBUFVDataRegisterOperandForOp<RegisterOperand Op, bit isTFE> {
}
class getMUBUFInsDA<list<RegisterOperand> vdataList,
- list<RegisterClassLike> vaddrList, bit isTFE, bit hasRestrictedSOffset> {
+ list<RegisterOperand> vaddrList, bit isTFE, bit hasRestrictedSOffset,
+ bit isTrue16, bit isLds> {
RegisterOperand vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
- RegisterClassLike vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
- RegisterOperand vdata_op = getBUFVDataRegisterOperand<!cast<SIRegisterClassLike>(vdataClass.RegClass).Size, isTFE>.ret;
+ RegisterOperand vaddr_op = !if(!empty(vaddrList), ?, !head(vaddrList));
+ RegisterOperand vdata_op = getBUFVDataRegisterOperand<!cast<SIRegisterClassLike>(vdataClass.RegClass).Size, isTFE, isTrue16>.ret;
dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset));
- dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz));
+ dag IsAsyncOpnd = !if(isLds, (ins i1imm_0:$IsAsync), (ins));
+ dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz), IsAsyncOpnd);
- dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddrClass:$vaddr), NonVaddrInputs));
+ dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddr_op:$vaddr), NonVaddrInputs));
dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs));
}
@@ -448,13 +450,14 @@ class getMUBUFElements<ValueType vt> {
);
}
-class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE, bit hasRestrictedSOffset> {
+class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE,
+ bit hasRestrictedSOffset, bit isTrue16, bit isLds> {
dag ret =
- !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset>.ret,
- !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret,
- !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret,
- !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret,
- !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset, isTrue16, isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPROp_32], isTFE, hasRestrictedSOffset, isTrue16, isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPROp_32], isTFE, hasRestrictedSOffset, isTrue16, isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VGPROp_64], isTFE, hasRestrictedSOffset, isTrue16, isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VGPROp_64], isTFE, hasRestrictedSOffset, isTrue16, isLds>.ret,
(ins))))));
}
@@ -499,7 +502,7 @@ class MUBUF_Load_Pseudo <string opName,
RegisterOperand vdata_op = getBUFVDataRegisterOperand<vdata_vt.Size, isTFE>.ret>
: MUBUF_Pseudo<opName,
!if(!or(isLds, isLdsOpc), (outs), (outs vdata_op:$vdata)),
- !con(getMUBUFIns<addrKind, [], isTFE, hasRestrictedSOffset>.ret,
+ !con(getMUBUFIns<addrKind, [], isTFE, hasRestrictedSOffset, 0, isLds>.ret,
!if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))),
getMUBUFAsmOps<addrKind, !or(isLds, isLdsOpc), isLds, isTFE>.ret,
pattern>,
@@ -509,7 +512,7 @@ class MUBUF_Load_Pseudo <string opName,
let AsmMatchConverter = "cvtMubuf";
let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", "");
- let LGKM_CNT = isLds;
+ let LGKM_CNT = 0;
let has_vdata = !not(!or(isLds, isLdsOpc));
let mayLoad = 1;
let mayStore = isLds;
@@ -566,6 +569,33 @@ multiclass MUBUF_Pseudo_Loads_Helper<string opName, ValueType load_vt,
}
}
+multiclass MUBUF_Pseudo_Loads_Helper_t16<string opName, string Hi16Name, string Lo16Name, ValueType load_vt,
+ bit TiedDest, bit isLds, bit isTFE, bit hasRestrictedSOffset> {
+ def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+ MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>, True16D16Table<Hi16Name#"_OFFSET", Lo16Name#"_OFFSET">;
+
+ def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+ MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>, True16D16Table<Hi16Name#"_ADDR64", Lo16Name#"_ADDR64">;
+
+ def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+ True16D16Table<Hi16Name#"_OFFEN", Lo16Name#"_OFFEN">;
+ def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+ True16D16Table<Hi16Name#"_IDXEN", Lo16Name#"_IDXEN">;
+ def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+ True16D16Table<Hi16Name#"_BOTHEN", Lo16Name#"_BOTHEN">;
+
+ let DisableWQM = 1 in {
+ def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+ True16D16Table<Hi16Name#"_OFFSET_exact", Lo16Name#"_OFFSET_exact">;
+ def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+ True16D16Table<Hi16Name#"_OFFEN_exact", Lo16Name#"_OFFEN_exact">;
+ def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+ True16D16Table<Hi16Name#"_IDXEN_exact", Lo16Name#"_IDXEN_exact">;
+ def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+ True16D16Table<Hi16Name#"_BOTHEN_exact", Lo16Name#"_BOTHEN_exact">;
+ }
+}
+
multiclass MUBUF_Pseudo_Loads<string opName, ValueType load_vt = i32,
bit TiedDest = 0, bit isLds = 0> {
defm NAME : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0, 0>;
@@ -577,6 +607,23 @@ multiclass MUBUF_Pseudo_Loads<string opName, ValueType load_vt = i32,
}
}
+multiclass MUBUF_Pseudo_Loads_t16<string opName, ValueType load_vt = i32,
+ bit TiedDest = 0, bit isLds = 0, string hiOpName = NAME#"_HI"> {
+ let True16Predicate = NotUseRealTrue16Insts in {
+ defm NAME : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0, 0>;
+ defm _VBUFFER : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0, 1>;
+ }
+ let True16Predicate = UseRealTrue16Insts in {
+ defvar NAME16 = opName#"_t16";
+ defm _t16 : MUBUF_Pseudo_Loads_Helper_t16<NAME16, hiOpName, NAME, i16, 0, isLds, 0, 0>;
+ defm _t16_VBUFFER : MUBUF_Pseudo_Loads_Helper_t16<NAME16, hiOpName#"_VBUFFER", NAME#"_VBUFFER", i16, 0, isLds, 0, 1>;
+ }
+ if !not(isLds) then {
+ defm _TFE : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 1, 0>;
+ defm _TFE_VBUFFER : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 1, 1>;
+ }
+}
+
multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32, Predicate LDSPred = TruePredicate> {
defm NAME : MUBUF_Pseudo_Loads<opName, load_vt>;
@@ -595,10 +642,11 @@ class MUBUF_Store_Pseudo <string opName,
ValueType store_vt,
bit isTFE = 0,
bit hasRestrictedSOffset = 0,
- list<dag> pattern=[]>
+ list<dag> pattern=[],
+ bit isTrue16 = false>
: MUBUF_Pseudo<opName,
(outs),
- getMUBUFIns<addrKind, [getVregSrcForVT<store_vt>.ret], isTFE, hasRestrictedSOffset>.ret,
+ getMUBUFIns<addrKind, [getVregSrcForVT<store_vt, isTrue16, 0>.ret], isTFE, hasRestrictedSOffset, isTrue16, 0>.ret,
getMUBUFAsmOps<addrKind, 0, 0, isTFE>.ret,
pattern>,
MUBUF_SetupAddr<addrKind> {
@@ -650,6 +698,33 @@ multiclass MUBUF_Pseudo_Stores_Helper<string opName, ValueType store_vt,
}
}
+multiclass MUBUF_Pseudo_Stores_Helper_t16<string opName, string Hi16Name, string Lo16Name, ValueType store_vt,
+ bit isTFE, bit hasRestrictedSOffset> {
+ def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+ MUBUFAddr64Table<0, NAME>, True16D16Table<Hi16Name#"_OFFSET", Lo16Name#"_OFFSET">;
+
+ def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+ MUBUFAddr64Table<1, NAME>, True16D16Table<Hi16Name#"_ADDR64", Lo16Name#"_ADDR64">;
+
+ def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+ True16D16Table<Hi16Name#"_OFFEN", Lo16Name#"_OFFEN">;
+ def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+ True16D16Table<Hi16Name#"_IDXEN", Lo16Name#"_IDXEN">;
+ def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+ True16D16Table<Hi16Name#"_BOTHEN", Lo16Name#"_BOTHEN">;
+
+ let DisableWQM = 1 in {
+ def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+ True16D16Table<Hi16Name#"_OFFSET_exact", Lo16Name#"_OFFSET_exact">;
+ def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+ True16D16Table<Hi16Name#"_OFFEN_exact", Lo16Name#"_OFFEN_exact">;
+ def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+ True16D16Table<Hi16Name#"_IDXEN_exact", Lo16Name#"_IDXEN_exact">;
+ def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+ True16D16Table<Hi16Name#"_BOTHEN_exact", Lo16Name#"_BOTHEN_exact">;
+ }
+}
+
multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32> {
defm NAME : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 0, 0>;
defm _TFE : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 0>;
@@ -658,6 +733,22 @@ multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32> {
defm _TFE_VBUFFER : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 1>;
}
+multiclass MUBUF_Pseudo_Stores_t16<string opName, ValueType store_vt = i32> {
+ defm _TFE : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 0>;
+ defm _TFE_VBUFFER : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 1>;
+
+ let True16Predicate = NotUseRealTrue16Insts in {
+ defm NAME : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 0, 0>;
+
+ defm _VBUFFER : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 0, 1>;
+ }
+ let True16Predicate = UseRealTrue16Insts, SubtargetPredicate = HasD16LoadStore in {
+ defvar NAME16 = opName#"_t16";
+ defm _t16 : MUBUF_Pseudo_Stores_Helper_t16<NAME16, NAME#"_D16_HI", NAME, i16, 0, 0>;
+ defm _t16_VBUFFER : MUBUF_Pseudo_Stores_Helper_t16<NAME16, NAME#"_D16_HI_VBUFFER", NAME#"_VBUFFER", i16, 0, 1>;
+ }
+}
+
class MUBUF_Pseudo_Store_Lds<string opName>
: MUBUF_Pseudo<opName,
(outs),
@@ -677,11 +768,11 @@ class MUBUF_Pseudo_Store_Lds<string opName>
}
class getMUBUFAtomicInsDA<RegisterOperand vdata_op, bit vdata_in, bit hasRestrictedSOffset,
- list<RegisterClassLike> vaddrList=[]> {
- RegisterClassLike vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ list<RegisterOperand> vaddrList=[]> {
+ RegisterOperand vaddr_op = !if(!empty(vaddrList), ?, !head(vaddrList));
dag VData = !if(vdata_in, (ins vdata_op:$vdata_in), (ins vdata_op:$vdata));
- dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddrClass:$vaddr)));
+ dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddr_op:$vaddr)));
dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset));
dag MainInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset));
dag CPol = !if(vdata_in, (ins CPol_GLC_WithDefault:$cpol),
@@ -698,13 +789,13 @@ class getMUBUFAtomicIns<int addrKind,
!if(!eq(addrKind, BUFAddrKind.Offset),
getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset>.ret,
!if(!eq(addrKind, BUFAddrKind.OffEn),
- getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret,
+ getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPROp_32]>.ret,
!if(!eq(addrKind, BUFAddrKind.IdxEn),
- getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret,
+ getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPROp_32]>.ret,
!if(!eq(addrKind, BUFAddrKind.BothEn),
- getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64_AlignTarget]>.ret,
+ getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPROp_64]>.ret,
!if(!eq(addrKind, BUFAddrKind.Addr64),
- getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64_AlignTarget]>.ret,
+ getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPROp_64]>.ret,
(ins))))));
}
@@ -783,37 +874,20 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
RegisterOperand vdataClass,
- ValueType vdataType,
- SDPatternOperator atomic> {
+ ValueType vdataType> {
let FPAtomic = vdataType.isFP in {
- def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0,
- [(set vdataType:$vdata,
- (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset),
- vdataType:$vdata_in))]>,
- MUBUFAddr64Table <0, NAME # "_RTN">;
-
- def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0,
- [(set vdataType:$vdata,
- (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset),
- vdataType:$vdata_in))]>,
- MUBUFAddr64Table <1, NAME # "_RTN">;
-
+ def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0>,
+ MUBUFAddr64Table <0, NAME # "_RTN">;
+ def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0>,
+ MUBUFAddr64Table <1, NAME # "_RTN">;
def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, 0>;
def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, 0>;
def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, 0>;
- def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1,
- [(set vdataType:$vdata,
- (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset),
- vdataType:$vdata_in))]>,
- MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">;
-
- def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1,
- [(set vdataType:$vdata,
- (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset),
- vdataType:$vdata_in))]>,
- MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">;
-
+ def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1>,
+ MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">;
+ def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1>,
+ MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">;
def _VBUFFER_OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.OffEn, vdataClass, 1>;
def _VBUFFER_IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.IdxEn, vdataClass, 1>;
def _VBUFFER_BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.BothEn, vdataClass, 1>;
@@ -822,10 +896,9 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
multiclass MUBUF_Pseudo_Atomics <string opName,
RegisterOperand vdataClass,
- ValueType vdataType,
- SDPatternOperator atomic = null_frag> :
+ ValueType vdataType> :
MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType>,
- MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType, atomic>;
+ MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType>;
//===----------------------------------------------------------------------===//
@@ -889,10 +962,16 @@ let TiedSourceNotRead = 1 in {
>;
} // End OtherPredicates = [HasUnpackedD16VMem], D16Buf = 1.
+
+let TiedSourceNotRead = 1, SubtargetPredicate = HasD16LoadStore, OtherPredicates = [HasFormattedMUBUFInsts] in
+defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_hi_x", i32
+>;
+
let OtherPredicates = [HasPackedD16VMem], D16Buf = 1 in {
let TiedSourceNotRead = 1 in {
- defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_x", f16
+ defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads_t16 <
+ "buffer_load_format_d16_x", f16, 0, 0, "BUFFER_LOAD_FORMAT_D16_HI_X"
>;
defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads <
"buffer_load_format_d16_xy", v2f16
@@ -948,9 +1027,6 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_aext_8_globa
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_zext_8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_aext_16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_zext_16_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_aext_8_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_zext_8_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i16, atomic_load_nonext_16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, zextloadi8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>;
@@ -977,12 +1053,23 @@ foreach vt = VReg_128.RegTypes in {
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", vt, load_global>;
}
-defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
+let SubtargetPredicate = HasD16LoadStore in {
+defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores <
+ "buffer_store_byte_d16_hi", i32
+>;
+
+defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores <
+ "buffer_store_short_d16_hi", i32
+>;
+}
+
+defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores_t16 <
"buffer_store_byte", i32
>;
-defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores <
+defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores_t16 <
"buffer_store_short", i32
>;
+
defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores <
"buffer_store_dword", i32
>;
@@ -1096,7 +1183,7 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
let OtherPredicates = [HasGFX10_BEncoding] in {
defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics <
- "buffer_atomic_csub", VGPROp_32, i32, int_amdgcn_global_atomic_csub
+ "buffer_atomic_csub", VGPROp_32, i32
>;
}
@@ -1117,65 +1204,52 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc",
let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fcmpswap", AVLdSt_64, v2f32, null_frag
+ "buffer_atomic_fcmpswap", AVLdSt_64, v2f32
>;
}
let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in {
defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fmin", AVLdSt_32, f32, null_frag
+ "buffer_atomic_fmin", AVLdSt_32, f32
>;
defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fmax", AVLdSt_32, f32, null_frag
+ "buffer_atomic_fmax", AVLdSt_32, f32
>;
}
let SubtargetPredicate = isGFX6GFX7GFX10 in {
defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64, null_frag
+ "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64
>;
}
let SubtargetPredicate = HasD16LoadStore in {
let TiedSourceNotRead = 1 in {
-defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads <
- "buffer_load_ubyte_d16", i32, 1
->;
-
defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads <
"buffer_load_ubyte_d16_hi", i32, 1
>;
-defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads <
- "buffer_load_sbyte_d16", i32, 1
->;
-
defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads <
"buffer_load_sbyte_d16_hi", i32, 1
>;
-defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads <
- "buffer_load_short_d16", i32, 1
->;
-
defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Pseudo_Loads <
"buffer_load_short_d16_hi", i32, 1
>;
-let OtherPredicates = [HasFormattedMUBUFInsts] in
-defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_hi_x", i32
+defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads_t16 <
+ "buffer_load_ubyte_d16", i32, 1
>;
-} // End TiedSourceNotRead
-defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores <
- "buffer_store_byte_d16_hi", i32
+defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads_t16 <
+ "buffer_load_sbyte_d16", i32, 1
>;
-defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores <
- "buffer_store_short_d16_hi", i32
+defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads_t16 <
+ "buffer_load_short_d16", i32, 1
>;
+} // End TiedSourceNotRead
let OtherPredicates = [HasFormattedMUBUFInsts] in
defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
@@ -1184,6 +1258,18 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
} // End HasD16LoadStore
+let True16Predicate = NotUseRealTrue16Insts in {
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_aext_8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_zext_8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i16, atomic_load_nonext_16_global>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE_D16_t16", i16, atomic_load_aext_8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE_D16_t16", i16, atomic_load_zext_8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SHORT_D16_t16", i16, atomic_load_nonext_16_global>;
+}
+
let SubtargetPredicate = isNotGFX940Plus in
def BUFFER_WBINVL1 : MUBUF_Invalidate <
"buffer_wbinvl1", int_amdgcn_buffer_wbinvl1
@@ -1201,12 +1287,12 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
let SubtargetPredicate = HasAtomicFaddRtnInsts in
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN<
- "buffer_atomic_add_f32", AVLdSt_32, f32, null_frag
+ "buffer_atomic_add_f32", AVLdSt_32, f32
>;
let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
- "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16, null_frag
+ "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16
>;
let SubtargetPredicate = isGFX12Plus in {
@@ -1385,8 +1471,14 @@ let OtherPredicates = [HasUnpackedD16VMem, HasFormattedMUBUFInsts] in {
} // End OtherPredicates = [HasUnpackedD16VMem, HasFormattedMUBUFInsts].
let OtherPredicates = [HasPackedD16VMem, HasFormattedMUBUFInsts] in {
+let True16Predicate = NotUseRealTrue16Insts in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X">;
+}
+let True16Predicate = UseRealTrue16Insts in {
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_t16">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_t16">;
+}
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i16, "BUFFER_LOAD_FORMAT_D16_XY">;
@@ -1952,15 +2044,26 @@ multiclass MUBUFLoad_PatternOffset <string Instr, ValueType vt,
}
let OtherPredicates = [Has16BitInsts] in {
-
+let True16Predicate = NotUseRealTrue16Insts in {
defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE", i16, sextloadi8_constant>;
defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, extloadi8_constant>;
defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, zextloadi8_constant>;
defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE", i16, sextloadi8_global>;
defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, extloadi8_global>;
defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, zextloadi8_global>;
-
defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_USHORT", i16, load_global>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE_D16_t16", i16, sextloadi8_constant>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE_D16_t16", i16, extloadi8_constant>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE_D16_t16", i16, zextloadi8_constant>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE_D16_t16", i16, sextloadi8_global>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE_D16_t16", i16, extloadi8_global>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE_D16_t16", i16, zextloadi8_global>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SHORT_D16_t16", i16, load_global>;
+}
+
} // End OtherPredicates = [Has16BitInsts]
@@ -2000,6 +2103,19 @@ multiclass MUBUFScratchLoadPat_D16_Common <string Instr,
>;
}
+multiclass MUBUFScratchLoadPat_D16_Common_t16 <string Instr, ValueType vt, PatFrag ld_frag> {
+ def : GCNPat <
+ (vt (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, i32:$offset))),
+ (!cast<MUBUF_Pseudo>(Instr # _OFFEN) $vaddr, $srsrc, $soffset, $offset)
+ >;
+
+ def : GCNPat <
+ (vt (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, i32:$offset))),
+ (!cast<MUBUF_Pseudo>(Instr # _OFFSET) $srsrc, $soffset, $offset)
+ >;
+}
+
multiclass MUBUFScratchLoadPat_D16 <string Instr,
ValueType vt, PatFrag ld_frag> {
let SubtargetPredicate = HasUnrestrictedSOffset in {
@@ -2008,17 +2124,35 @@ multiclass MUBUFScratchLoadPat_D16 <string Instr,
defm : MUBUFScratchLoadPat_D16_Common<Instr # "_VBUFFER", vt, ld_frag>;
}
-let OtherPredicates = [DisableFlatScratch] in {
+multiclass MUBUFScratchLoadPat_D16_t16 <string Instr,
+ ValueType vt, PatFrag ld_frag> {
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : MUBUFScratchLoadPat_D16_Common_t16<Instr, vt, ld_frag>;
+ }
+ defm : MUBUFScratchLoadPat_D16_Common_t16<Instr # "_VBUFFER", vt, ld_frag>;
+}
+
+let OtherPredicates = [NotHasFlatScratchEnabled] in {
defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SBYTE", i32, sextloadi8_private>;
defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i32, extloadi8_private>;
defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i32, zextloadi8_private>;
-defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SBYTE", i16, sextloadi8_private>;
-defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, extloadi8_private>;
-defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, zextloadi8_private>;
defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SSHORT", i32, sextloadi16_private>;
defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i32, extloadi16_private>;
defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i32, zextloadi16_private>;
+
+let True16Predicate = NotUseRealTrue16Insts in {
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SBYTE", i16, sextloadi8_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, extloadi8_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, zextloadi8_private>;
defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i16, load_private>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+defm : MUBUFScratchLoadPat_D16_t16 <"BUFFER_LOAD_SBYTE_D16_t16", i16, sextloadi8_private>;
+defm : MUBUFScratchLoadPat_D16_t16 <"BUFFER_LOAD_UBYTE_D16_t16", i16, extloadi8_private>;
+defm : MUBUFScratchLoadPat_D16_t16 <"BUFFER_LOAD_UBYTE_D16_t16", i16, zextloadi8_private>;
+defm : MUBUFScratchLoadPat_D16_t16 <"BUFFER_LOAD_SHORT_D16_t16", i16, load_private>;
+}
foreach vt = Reg32Types.types in {
defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORD", vt, load_private>;
@@ -2027,7 +2161,7 @@ defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORDX2", v2i32, load_private>;
defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORDX3", v3i32, load_private>;
defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORDX4", v4i32, load_private>;
-let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in {
+let OtherPredicates = [D16PreservesUnusedBits, NotHasFlatScratchEnabled] in {
defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SHORT_D16_HI", v2i16, load_d16_hi_private>;
defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_UBYTE_D16_HI", v2i16, az_extloadi8_d16_hi_private>;
defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SBYTE_D16_HI", v2i16, sextloadi8_d16_hi_private>;
@@ -2043,7 +2177,7 @@ defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_UBYTE_D16", v2f16, az_extloadi8_d16_
defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SBYTE_D16", v2f16, sextloadi8_d16_lo_private>;
}
-} // End OtherPredicates = [DisableFlatScratch]
+} // End OtherPredicates = [NotHasFlatScratchEnabled]
multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
ValueType vt, PatFrag atomic_st> {
@@ -2084,8 +2218,15 @@ multiclass MUBUFStore_PatternOffset <string Instr, ValueType vt,
defm : MUBUFStore_PatternOffset_Common<Instr # "_VBUFFER", vt, st>;
}
+let True16Predicate = NotUseRealTrue16Insts in {
defm : MUBUFStore_PatternOffset <"BUFFER_STORE_BYTE", i16, truncstorei8_global>;
defm : MUBUFStore_PatternOffset <"BUFFER_STORE_SHORT", i16, store_global>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+defm : MUBUFStore_PatternOffset <"BUFFER_STORE_BYTE_t16", i16, truncstorei8_global>;
+defm : MUBUFStore_PatternOffset <"BUFFER_STORE_SHORT_t16", i16, store_global>;
+}
multiclass MUBUFScratchStorePat_Common <string Instr,
ValueType vt, PatFrag st,
@@ -2112,11 +2253,19 @@ multiclass MUBUFScratchStorePat <string Instr,
defm : MUBUFScratchStorePat_Common<Instr # "_VBUFFER", vt, st, rc>;
}
-let OtherPredicates = [DisableFlatScratch] in {
+let OtherPredicates = [NotHasFlatScratchEnabled] in {
defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE", i32, truncstorei8_private>;
defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT", i32, truncstorei16_private>;
+
+let True16Predicate = NotUseRealTrue16Insts in {
defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE", i16, truncstorei8_private>;
defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT", i16, store_private>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE_t16", i16, truncstorei8_private, VGPR_16>;
+defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT_t16", i16, store_private, VGPR_16>;
+}
foreach vt = Reg32Types.types in {
defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORD", vt, store_private>;
@@ -2127,7 +2276,7 @@ defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORDX3", v3i32, store_private, VReg_
defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORDX4", v4i32, store_private, VReg_128>;
-let OtherPredicates = [HasD16LoadStore, DisableFlatScratch] in {
+let OtherPredicates = [HasD16LoadStore, NotHasFlatScratchEnabled] in {
// Hiding the extract high pattern in the PatFrag seems to not
// automatically increase the complexity.
let AddedComplexity = 1 in {
@@ -2135,7 +2284,7 @@ defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT_D16_HI", i32, store_hi16_privat
defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE_D16_HI", i32, truncstorei8_hi16_private>;
}
}
-} // End OtherPredicates = [DisableFlatScratch]
+} // End OtherPredicates = [NotHasFlatScratchEnabled]
//===----------------------------------------------------------------------===//
// MTBUF Patterns
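
An aside on the "let AddedComplexity = 1" in the scratch-store hunk above: the instruction selector ranks candidate patterns by a complexity score, conventionally a size-derived base plus any explicit AddedComplexity, and wrapping the extract-high operation in a PatFrag leaves that base score unchanged, hence the manual bump. A minimal sketch of that tie-breaking under this scoring model only (the types and names are illustrative, not the actual SelectionDAG machinery):

#include <algorithm>
#include <vector>

// Illustrative model: each candidate pattern carries a size-derived base
// complexity plus an explicit AddedComplexity; the matcher tries the
// highest-scoring candidate first.
struct CandidatePattern {
  int BaseComplexity;  // derived from the pattern's size/shape
  int AddedComplexity; // the explicit "let AddedComplexity = N" bias
};

int score(const CandidatePattern &P) {
  return P.BaseComplexity + P.AddedComplexity;
}

const CandidatePattern *pickFirstTry(const std::vector<CandidatePattern> &Cs) {
  auto It = std::max_element(
      Cs.begin(), Cs.end(),
      [](const CandidatePattern &A, const CandidatePattern &B) {
        return score(A) < score(B);
      });
  return It == Cs.end() ? nullptr : &*It;
}
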
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 13f727b68..ae684a5 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -15,6 +15,7 @@ tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering)
tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank)
tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM AMDGPUGenSDNodeInfo.inc -gen-sd-node-info)
tablegen(LLVM AMDGPUGenSearchableTables.inc -gen-searchable-tables)
tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget)
@@ -39,10 +40,6 @@ tablegen(LLVM R600GenSubtargetInfo.inc -gen-subtarget)
add_public_tablegen_target(AMDGPUCommonTableGen)
-set(LLVM_TARGET_DEFINITIONS InstCombineTables.td)
-tablegen(LLVM InstCombineTables.inc -gen-searchable-tables)
-add_public_tablegen_target(InstCombineTableGen)
-
add_llvm_target(AMDGPUCodeGen
AMDGPUAliasAnalysis.cpp
AMDGPUAlwaysInlinePass.cpp
@@ -52,6 +49,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUAsmPrinter.cpp
AMDGPUAtomicOptimizer.cpp
AMDGPUAttributor.cpp
+ AMDGPUBarrierLatency.cpp
AMDGPUCallLowering.cpp
AMDGPUCodeGenPrepare.cpp
AMDGPUCombinerHelper.cpp
@@ -61,6 +59,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUFrameLowering.cpp
AMDGPUGlobalISelDivergenceLowering.cpp
AMDGPUGlobalISelUtils.cpp
+ AMDGPUHazardLatency.cpp
AMDGPUHSAMetadataStreamer.cpp
AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp
@@ -80,6 +79,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
AMDGPUPrepareAGPRAlloc.cpp
+ AMDGPULowerExecSync.cpp
AMDGPUSwLowerLDS.cpp
AMDGPUMachineFunction.cpp
AMDGPUMachineModuleInfo.cpp
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index d0ad120..d8a8450 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -128,7 +128,7 @@ class DS_0A1D_NORET<string opName, RegisterOperand rc = AVLdSt_32>
class DS_1A1D_NORET<string opName, RegisterOperand rc = AVLdSt_32>
: DS_Pseudo<opName,
(outs),
- (ins VGPR_32:$addr, rc:$data0, Offset:$offset, gds:$gds),
+ (ins VGPROp_32:$addr, rc:$data0, Offset:$offset, gds:$gds),
" $addr, $data0$offset$gds"> {
let has_data1 = 0;
@@ -163,7 +163,7 @@ multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterOperand rc = AVLdSt_32>
class DS_1A2D_NORET<string opName, RegisterOperand data_op = VGPROp_32>
: DS_Pseudo<opName,
(outs),
- (ins VGPR_32:$addr, data_op:$data0, data_op:$data1, Offset:$offset, gds:$gds),
+ (ins VGPROp_32:$addr, data_op:$data0, data_op:$data1, Offset:$offset, gds:$gds),
" $addr, $data0, $data1$offset$gds"> {
let has_vdst = 0;
@@ -190,7 +190,7 @@ multiclass DS_1A2D_NORET_mc<string opName, RegisterOperand rc = VGPROp_32> {
class DS_1A2D_Off8_NORET <string opName, RegisterOperand data_op = VGPROp_32>
: DS_Pseudo<opName,
(outs),
- (ins VGPR_32:$addr, data_op:$data0, data_op:$data1,
+ (ins VGPROp_32:$addr, data_op:$data0, data_op:$data1,
Offset0:$offset0, Offset1:$offset1, gds:$gds),
" $addr, $data0, $data1$offset0$offset1$gds"> {
@@ -230,7 +230,7 @@ class DS_0A1D_RET_GDS<string opName, RegisterOperand dst_op = AVLdSt_32,
class DS_1A1D_RET <string opName, RegisterOperand data_op = AVLdSt_32>
: DS_Pseudo<opName,
(outs data_op:$vdst),
- (ins VGPR_32:$addr, data_op:$data0, Offset:$offset, gds:$gds),
+ (ins VGPROp_32:$addr, data_op:$data0, Offset:$offset, gds:$gds),
" $vdst, $addr, $data0$offset$gds"> {
let has_data1 = 0;
@@ -260,7 +260,7 @@ class DS_1A2D_RET<string opName,
RegisterOperand dst_rc = VGPROp_32,
RegisterOperand src_rc = dst_rc>: DS_Pseudo<opName,
(outs dst_rc:$vdst),
- (ins VGPR_32:$addr, src_rc:$data0, src_rc:$data1, Offset:$offset, gds:$gds),
+ (ins VGPROp_32:$addr, src_rc:$data0, src_rc:$data1, Offset:$offset, gds:$gds),
" $vdst, $addr, $data0, $data1$offset$gds"> {
let IsAtomicRet = 1;
@@ -286,7 +286,7 @@ class DS_1A2D_Off8_RET<string opName,
RegisterOperand src_rc = dst_rc>
: DS_Pseudo<opName,
(outs dst_rc:$vdst),
- (ins VGPR_32:$addr, src_rc:$data0, src_rc:$data1, Offset0:$offset0, Offset1:$offset1, gds:$gds),
+ (ins VGPROp_32:$addr, src_rc:$data0, src_rc:$data1, Offset0:$offset0, Offset1:$offset1, gds:$gds),
" $vdst, $addr, $data0, $data1$offset0$offset1$gds"> {
let has_offset = 0;
@@ -311,8 +311,8 @@ class DS_BVH_STACK<string opName,
RegisterOperand vdst_rc,
RegisterOperand data1_rc>
: DS_Pseudo<opName,
- (outs vdst_rc:$vdst, VGPR_32:$addr),
- (ins VGPR_32:$addr_in, VGPR_32:$data0, data1_rc:$data1, Offset:$offset),
+ (outs vdst_rc:$vdst, VGPROp_32:$addr),
+ (ins VGPROp_32:$addr_in, VGPROp_32:$data0, data1_rc:$data1, Offset:$offset),
" $vdst, $addr, $data0, $data1$offset"> {
let Constraints = "$addr = $addr_in";
let has_gds = 0;
@@ -327,8 +327,8 @@ class DS_1A_RET<string opName, RegisterOperand data_op = AVLdSt_32,
: DS_Pseudo<opName,
(outs data_op:$vdst),
!if(HasTiedOutput,
- (ins VGPR_32:$addr, ofs:$offset, gds:$gds, data_op:$vdst_in),
- (ins VGPR_32:$addr, ofs:$offset, gds:$gds)),
+ (ins VGPROp_32:$addr, ofs:$offset, gds:$gds, data_op:$vdst_in),
+ (ins VGPROp_32:$addr, ofs:$offset, gds:$gds)),
" $vdst, $addr$offset$gds"> {
let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
let has_data0 = 0;
@@ -366,7 +366,7 @@ class DS_1A_RET_Tied<string opName, RegisterOperand rc = AVLdSt_32> :
class DS_1A_Off8_RET <string opName, RegisterOperand rc = AVLdSt_32>
: DS_Pseudo<opName,
(outs rc:$vdst),
- (ins VGPR_32:$addr, Offset0:$offset0, Offset1:$offset1, gds:$gds),
+ (ins VGPROp_32:$addr, Offset0:$offset0, Offset1:$offset1, gds:$gds),
" $vdst, $addr$offset0$offset1$gds"> {
let has_offset = 0;
@@ -384,7 +384,7 @@ multiclass DS_1A_Off8_RET_mc <string opName, RegisterOperand rc = VGPROp_32> {
class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
(outs AVLdSt_32:$vdst),
- (ins VGPR_32:$addr, Offset:$offset),
+ (ins VGPROp_32:$addr, Offset:$offset),
" $vdst, $addr$offset gds"> {
let has_data0 = 0;
@@ -396,7 +396,7 @@ class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
class DS_1A_Off16_NORET <string opName>
: DS_Pseudo<opName,
(outs),
- (ins VGPR_32:$addr, Offset:$offset, gds:$gds),
+ (ins VGPROp_32:$addr, Offset:$offset, gds:$gds),
" $addr$offset$gds"> {
let has_vdst = 0;
@@ -422,7 +422,7 @@ class DS_0A_RET <string opName> : DS_Pseudo<opName,
class DS_1A <string opName> : DS_Pseudo<opName,
(outs),
- (ins VGPR_32:$addr, Offset:$offset, gds:$gds),
+ (ins VGPROp_32:$addr, Offset:$offset, gds:$gds),
" $addr$offset$gds"> {
let mayLoad = 1;
@@ -463,7 +463,7 @@ class DS_GWS_0D <string opName>
class DS_GWS_1D <string opName>
: DS_GWS<opName,
- (ins AVLdSt_32:$data0, Offset:$offset),
+ (ins AV_LdSt_32_Align2_RegOp:$data0, Offset:$offset),
" $data0$offset gds"> {
let has_gws_data0 = 1;
@@ -491,7 +491,7 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag,
RegisterOperand data_op = AVLdSt_32>
: DS_Pseudo<opName,
(outs data_op:$vdst),
- (ins VGPR_32:$addr, data_op:$data0, Offset:$offset),
+ (ins VGPROp_32:$addr, data_op:$data0, Offset:$offset),
" $vdst, $addr, $data0$offset",
[(set i32:$vdst,
(node (DS1Addr1Offset i32:$addr, i32:$offset), i32:$data0))] > {
@@ -886,17 +886,6 @@ defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc_gfx9<"ds_sub_clamp_rtn_u32", VGPROp_3
def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32",
int_amdgcn_ds_bpermute_fi_b32>;
-multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
- ValueType vt, string frag> {
- def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_local_addrspace")>;
-
- let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_noret_local_addrspace"), /* complexity */ 1>;
-}
-
-defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">;
} // let SubtargetPredicate = isGFX12Plus
let SubtargetPredicate = isGFX1250Plus in {
@@ -917,7 +906,7 @@ def DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 : DS_1A_Off16_NORET<"ds_atomic_async_barr
def : GCNPat <
(int_amdgcn_ds_atomic_async_barrier_arrive_b64 (DS1Addr1Offset i32:$ptr, i32:$offset)),
- (DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 VGPR_32:$ptr, Offset:$offset, (i1 0))
+ (DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 VGPROp_32:$ptr, Offset:$offset, (i1 0))
>;
defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_1A1D_RET_mc_gfx9<"ds_atomic_barrier_arrive_rtn_b64", VGPROp_64>;
@@ -943,7 +932,7 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore
def : GCNPat <
(int_amdgcn_ds_swizzle i32:$src, timm:$offset16),
- (DS_SWIZZLE_B32 VGPR_32:$src, (as_i16timm $offset16), (i1 0))
+ (DS_SWIZZLE_B32 VGPROp_32:$src, (as_i16timm $offset16), (i1 0))
>;
class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
@@ -1279,6 +1268,14 @@ defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "ato
defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">;
}
+let SubtargetPredicate = HasAtomicDsCondSubClampInsts in {
+
+defm : DSAtomicRetNoRetPat_NoM0_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "atomic_load_usub_cond">;
+
+defm : DSAtomicRetNoRetPat_NoM0_mc<DS_SUB_CLAMP_RTN_U32, DS_SUB_CLAMP_U32, i32, "atomic_load_usub_sat">;
+
+} // let SubtargetPredicate = HasAtomicDsCondSubClampInsts
+
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">;
}
@@ -1339,28 +1336,28 @@ def : GCNPat <
def : GCNPat <
(i64 (int_amdgcn_ds_add_gs_reg_rtn i32:$src, timm:$offset32)),
- (DS_ADD_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32))
+ (DS_ADD_GS_REG_RTN VGPROp_32:$src, (as_i32timm $offset32))
>;
def : GCNPat <
(i32 (int_amdgcn_ds_add_gs_reg_rtn i32:$src, timm:$offset32)),
(EXTRACT_SUBREG
(i64 (COPY_TO_REGCLASS
- (DS_ADD_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)),
+ (DS_ADD_GS_REG_RTN VGPROp_32:$src, (as_i32timm $offset32)),
VReg_64)),
sub0)
>;
def : GCNPat <
(i64 (int_amdgcn_ds_sub_gs_reg_rtn i32:$src, timm:$offset32)),
- (DS_SUB_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32))
+ (DS_SUB_GS_REG_RTN VGPROp_32:$src, (as_i32timm $offset32))
>;
def : GCNPat <
(i32 (int_amdgcn_ds_sub_gs_reg_rtn i32:$src, timm:$offset32)),
(EXTRACT_SUBREG
(i64 (COPY_TO_REGCLASS
- (DS_SUB_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)),
+ (DS_SUB_GS_REG_RTN VGPROp_32:$src, (as_i32timm $offset32)),
VReg_64)),
sub0)
>;
@@ -1488,6 +1485,12 @@ let AssemblerPredicate = isGFX12Plus in {
def : MnemonicAlias<"ds_load_tr_b64", "ds_load_tr8_b64">, Requires<[isGFX1250Plus]>;
def : MnemonicAlias<"ds_load_tr_b128", "ds_load_tr16_b128">, Requires<[isGFX1250Plus]>;
+// Additional aliases for ds load transpose instructions.
+def : MnemonicAlias<"ds_load_b64_tr_b8", "ds_load_tr8_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"ds_load_b128_tr_b16", "ds_load_tr16_b128">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"ds_load_b64_tr_b4", "ds_load_tr4_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"ds_load_b96_tr_b6", "ds_load_tr6_b96">, Requires<[isGFX125xOnly]>;
+
//===----------------------------------------------------------------------===//
// GFX11.
//===----------------------------------------------------------------------===//
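
The MnemonicAlias defs a few lines up let the GFX125x-only spellings assemble to the canonical tr4/tr6/tr8/tr16 opcodes; at parse time the alias is simply rewritten to the target mnemonic. A toy model of that mapping, using only the four pairs visible in the defs (the table and lookup are illustrative, not the AsmParser's actual mechanism):

#include <map>
#include <string>

// Alias spelling -> canonical mnemonic, copied from the MnemonicAlias defs.
const std::map<std::string, std::string> DsLoadTrAliases = {
    {"ds_load_b64_tr_b8",   "ds_load_tr8_b64"},
    {"ds_load_b128_tr_b16", "ds_load_tr16_b128"},
    {"ds_load_b64_tr_b4",   "ds_load_tr4_b64"},
    {"ds_load_b96_tr_b6",   "ds_load_tr6_b96"},
};

// Resolve a mnemonic, falling back to the input when it is not an alias.
std::string canonicalMnemonic(const std::string &Name) {
  auto It = DsLoadTrAliases.find(Name);
  return It == DsLoadTrAliases.end() ? Name : It->second;
}
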
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index e0375ea..b2dfd09 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -528,12 +528,26 @@ void AMDGPUDisassembler::decodeImmOperands(MCInst &MI,
break;
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_IMM_INT16:
- case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ Imm = getInlineImmValF16(Imm);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
Imm = getInlineImmValF16(Imm);
break;
+ case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: {
+ // V_PK_FMAC_F16 on GFX11+ duplicates the f16 inline constant to both
+      // halves, so we need to produce the duplicated value for a correct
+      // round-trip.
+ if (isGFX11Plus()) {
+ int64_t F16Val = getInlineImmValF16(Imm);
+ Imm = (F16Val << 16) | (F16Val & 0xFFFF);
+ } else {
+ Imm = getInlineImmValF16(Imm);
+ }
+ break;
+ }
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
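
On the new OPERAND_REG_IMM_V2FP16_SPLAT case above: on GFX11+ the decoder widens the 16-bit inline constant into both halves of the 32-bit packed immediate, so the printed value round-trips through the assembler. A minimal sketch of just that splat, under the bit manipulation shown in the hunk (the helper name and sample value are illustrative):

#include <cassert>
#include <cstdint>

// Duplicate a 16-bit inline constant into both halves of a packed v2f16
// immediate, as the isGFX11Plus() branch above does.
uint32_t splatF16Imm(uint16_t F16Val) {
  return (static_cast<uint32_t>(F16Val) << 16) | F16Val;
}

int main() {
  // IEEE half 1.0 encodes as 0x3C00, so the packed splat is 0x3C003C00.
  assert(splatF16Imm(0x3C00) == 0x3C003C00u);
  return 0;
}
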
@@ -566,7 +580,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
// encodings
- if (isGFX1250() && Bytes.size() >= 16) {
+ if (isGFX1250Plus() && Bytes.size() >= 16) {
std::bitset<128> DecW = eat16Bytes(Bytes);
if (tryDecodeInst(DecoderTableGFX1250128, MI, DecW, Address, CS))
break;
@@ -595,6 +609,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS))
break;
+ if (isGFX13() &&
+ tryDecodeInst(DecoderTableGFX1396, DecoderTableGFX13_FAKE1696, MI,
+ DecW, Address, CS))
+ break;
+
if (STI.hasFeature(AMDGPU::Feature64BitLiterals)) {
// Return 8 bytes for a potential literal.
Bytes = Bytes_.slice(4, MaxInstBytesNum - 4);
@@ -680,6 +699,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS))
break;
+ if (isGFX13() &&
+ tryDecodeInst(DecoderTableGFX1364, DecoderTableGFX13_FAKE1664, MI, QW,
+ Address, CS))
+ break;
+
// Reinitialize Bytes
Bytes = Bytes_.slice(0, MaxInstBytesNum);
}
@@ -727,6 +751,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW,
Address, CS))
break;
+
+ if (isGFX13() &&
+ tryDecodeInst(DecoderTableGFX1332, DecoderTableGFX13_FAKE1632, MI, DW,
+ Address, CS))
+ break;
}
return MCDisassembler::Fail;
@@ -892,6 +921,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// have EXEC as implicit destination. Issue a warning if encoding for
// vdst is not EXEC.
if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3) &&
+ MCII->get(MI.getOpcode()).getNumDefs() == 0 &&
MCII->get(MI.getOpcode()).hasImplicitDefOfPhysReg(AMDGPU::EXEC)) {
auto ExecEncoding = MRI.getEncodingValue(AMDGPU::EXEC_LO);
if (Bytes_[0] != ExecEncoding)
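
A note on the extra getNumDefs() == 0 above: it narrows the warning to VOP3 opcodes whose only destination is the implicit EXEC, so instructions that also carry an explicit vdst (and therefore legitimately encode some other register in that field) are no longer flagged. Restated as a standalone predicate, a sketch only (the struct and flag bit are stand-ins for the MCInstrDesc queries used in the hunk):

#include <cstdint>

// Sketch of the tightened guard: warn only when the opcode is VOP3-encoded,
// has no explicit defs, and implicitly defines EXEC. The flag value is a
// stand-in, not the real SIInstrFlags::VOP3.
struct OpcodeDesc {
  uint64_t TSFlags;
  unsigned NumExplicitDefs;
  bool ImplicitlyDefinesExec;
};

constexpr uint64_t VOP3Flag = 1ull << 0; // stand-in bit

bool wantsExecEncodingCheck(const OpcodeDesc &Desc) {
  return (Desc.TSFlags & VOP3Flag) && // VOP3 encoding
         Desc.NumExplicitDefs == 0 && // no explicit destination operands
         Desc.ImplicitlyDefinesExec;  // EXEC as implicit destination
}
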
@@ -1198,8 +1228,8 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
// Given a wide tuple \p Reg check if it will overflow 256 registers.
// \returns \p Reg on success or NoRegister otherwise.
-static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC,
- const MCRegisterInfo &MRI) {
+static MCRegister CheckVGPROverflow(MCRegister Reg, const MCRegisterClass &RC,
+ const MCRegisterInfo &MRI) {
unsigned NumRegs = RC.getSizeInBits() / 32;
MCRegister Sub0 = MRI.getSubReg(Reg, AMDGPU::sub0);
if (!Sub0)
@@ -1213,7 +1243,7 @@ static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC,
assert(BaseReg && "Only vector registers expected");
- return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : AMDGPU::NoRegister;
+ return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : MCRegister();
}
// Note that before gfx10, the MIMG encoding provided no information about
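
For intuition on the bound in CheckVGPROverflow above: a wide tuple occupies SizeInBits/32 consecutive VGPRs and the encoding addresses 256 of them, so the tuple fits only when its first register index plus its width stays within 256. A small arithmetic sketch under that assumption (names illustrative):

#include <cassert>

// A tuple of SizeInBits/32 registers starting at FirstIdx fits the 256-entry
// VGPR file only if FirstIdx + NumRegs <= 256, matching the check above.
bool tupleFits(unsigned FirstIdx, unsigned SizeInBits) {
  unsigned NumRegs = SizeInBits / 32;
  return FirstIdx + NumRegs <= 256;
}

int main() {
  assert(tupleFits(252, 128));  // v[252:255]: last valid 128-bit tuple
  assert(!tupleFits(254, 128)); // v[254:257] would run past the file
  return 0;
}
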
@@ -1455,9 +1485,8 @@ MCOperand AMDGPUDisassembler::errOperand(unsigned V,
return MCOperand();
}
-inline
-MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const {
- return MCOperand::createReg(AMDGPU::getMCReg(RegId, STI));
+inline MCOperand AMDGPUDisassembler::createRegOperand(MCRegister Reg) const {
+ return MCOperand::createReg(AMDGPU::getMCReg(Reg, STI));
}
inline
@@ -1597,6 +1626,9 @@ AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc,
case AMDGPU::OPERAND_REG_IMM_V2FP16:
UseLit = AMDGPU::isInlinableLiteralV2F16(Val);
break;
+ case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
+ UseLit = AMDGPU::isPKFMACF16InlineConstant(Val, isGFX11Plus());
+ break;
case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
break;
case AMDGPU::OPERAND_REG_IMM_INT16:
@@ -2225,6 +2257,16 @@ bool AMDGPUDisassembler::isGFX12Plus() const {
bool AMDGPUDisassembler::isGFX1250() const { return AMDGPU::isGFX1250(STI); }
+bool AMDGPUDisassembler::isGFX1250Plus() const {
+ return AMDGPU::isGFX1250Plus(STI);
+}
+
+bool AMDGPUDisassembler::isGFX13() const { return AMDGPU::isGFX13(STI); }
+
+bool AMDGPUDisassembler::isGFX13Plus() const {
+ return AMDGPU::isGFX13Plus(STI);
+}
+
bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch);
}
@@ -2381,7 +2423,7 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
}
// Bits [27].
- if (isGFX1250()) {
+ if (isGFX1250Plus()) {
PRINT_PSEUDO_DIRECTIVE_COMMENT("FLAT_SCRATCH_IS_NV",
COMPUTE_PGM_RSRC1_GFX125_FLAT_SCRATCH_IS_NV);
} else {
@@ -2395,7 +2437,7 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
// Bits [29-31].
if (isGFX10Plus()) {
// WGP_MODE is not available on GFX1250.
- if (!isGFX1250()) {
+ if (!isGFX1250Plus()) {
PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode",
COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE);
}
@@ -2526,7 +2568,7 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3(
}
// Bits [14-21].
- if (isGFX1250()) {
+ if (isGFX1250Plus()) {
PRINT_DIRECTIVE(".amdhsa_named_barrier_count",
COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT);
PRINT_PSEUDO_DIRECTIVE_COMMENT(
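
The "Bits [...]" comments in these hunks name fixed fields inside the 32-bit COMPUTE_PGM_RSRC words of the kernel descriptor; the PRINT_* macros extract them and emit the matching .amdhsa_* directive or comment. A generic shift-and-mask sketch of that extraction, with the field positions taken from the comments above (the helper itself is illustrative):

#include <cstdint>

// Pull the Width-bit field starting at bit Lo out of a 32-bit RSRC word,
// e.g. bit 27 (FLAT_SCRATCH_IS_NV on GFX125x) or bits [29:31].
uint32_t extractRsrcField(uint32_t Word, unsigned Lo, unsigned Width) {
  return (Word >> Lo) & ((1u << Width) - 1u);
}

// Usage: extractRsrcField(Rsrc1, 27, 1) reads the single-bit field at 27;
// extractRsrcField(Rsrc1, 29, 3) reads bits [29:31].
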
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index d103d79..28f71d8 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -69,7 +69,7 @@ public:
const char* getRegClassName(unsigned RegClassID) const;
- MCOperand createRegOperand(unsigned int RegId) const;
+ MCOperand createRegOperand(MCRegister Reg) const;
MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const;
MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const;
MCOperand createVGPR16Operand(unsigned RegIdx, bool IsHi) const;
@@ -182,6 +182,9 @@ public:
bool isGFX12() const;
bool isGFX12Plus() const;
bool isGFX1250() const;
+ bool isGFX1250Plus() const;
+ bool isGFX13() const;
+ bool isGFX13Plus() const;
bool hasArchitectedFlatScratch() const;
bool hasKernargPreload() const;
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index dadc7dc..a2e3ece 100644
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -505,7 +505,6 @@ def : AMDGPUPat <
(fshr i32:$src0, i32:$src1, i32:$src2),
(BIT_ALIGN_INT_eg $src0, $src1, $src2)
>;
-def : ROTRPattern <BIT_ALIGN_INT_eg>;
def MULADD_eg : MULADD_Common<0x14>;
def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
def FMA_eg : FMA_Common<0x7>;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 6de59be..63460b5 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -229,13 +229,13 @@ class GlobalSaddrTable <bit is_saddr, string Name = ""> {
class FLAT_Load_Pseudo<
string opName, RegisterOperand vdata_op, bit HasTiedOutput = 0,
bit HasSaddr = 0, bit EnableSaddr = 0,
- RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)>
+ RegisterOperand VaddrOp = !if(EnableSaddr, VGPROp_32, VGPROp_64)>
: FLAT_Pseudo<opName, (outs), (ins), ""> {
let OutOperandList = (outs vdata_op:$vdst);
let InOperandList = !con(
!if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)),
- (ins VaddrRC:$vaddr, flat_offset:$offset),
+ (ins VaddrOp:$vaddr, flat_offset:$offset),
// FIXME: Operands with default values do not work with following
// non-optional operands.
!if(HasTiedOutput, (ins CPol:$cpol, vdata_op:$vdst_in),
@@ -262,15 +262,25 @@ multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterOperand regClass = AVLdS
multiclass FLAT_Flat_Load_Pseudo_t16<string opName> {
defm "" : FLAT_Flat_Load_Pseudo<opName, AVLdSt_32, 1>;
- let True16Predicate = UseRealTrue16Insts in
- defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPROp_16>, True16D16Table<NAME#"_HI", NAME>;
+
+ defvar Name16 = opName#"_t16";
+ let True16Predicate = UseRealTrue16Insts in {
+ def _t16 : FLAT_Load_Pseudo<Name16, VGPROp_16>,
+ GlobalSaddrTable<0, Name16>,
+ True16D16Table<NAME#"_HI", NAME>;
+
+ let OtherPredicates = [HasFlatGVSMode] in
+ def _t16_SADDR : FLAT_Load_Pseudo<Name16, VGPROp_16, 0, 1, 1>,
+ GlobalSaddrTable<1, Name16>,
+ True16D16Table<NAME#"_HI_SADDR", NAME#"_SADDR">;
+ }
}
class FLAT_Store_Pseudo <string opName, RegisterOperand vdataClass,
bit HasSaddr = 0, bit EnableSaddr = 0,
- RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> : FLAT_Pseudo<opName, (outs), (ins), ""> {
+ RegisterOperand VaddrOp = !if(EnableSaddr, VGPROp_32, VGPROp_64)> : FLAT_Pseudo<opName, (outs), (ins), ""> {
let InOperandList = !con(
- (ins VaddrRC:$vaddr, vdataClass:$vdata),
+ (ins VaddrOp:$vaddr, vdataClass:$vdata),
!if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)),
(ins flat_offset:$offset, CPol_0:$cpol));
let AsmOperands = " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$cpol";
@@ -380,15 +390,16 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> {
// Async loads, introduced in gfx1250, will store directly
// to a DS address in vdst (they will not use M0 for DS address).
-class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo<
+class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0, bit IsLegacyLDSDMA = 0> : FLAT_Pseudo<
opName,
(outs ),
!con(
- !if(IsAsync, (ins VGPR_32:$vdst), (ins)),
- !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)),
- (ins flat_offset:$offset, CPol_0:$cpol)),
+ !if(IsAsync, (ins VGPROp_32:$vdst), (ins)),
+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPROp_32:$vaddr), (ins VGPROp_64:$vaddr)),
+ (ins flat_offset:$offset, CPol_0:$cpol),
+ !if(IsLegacyLDSDMA, (ins i1imm_0:$IsAsync), (ins))),
!if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
- let LGKM_CNT = !not(IsAsync);
+ let LGKM_CNT = 0;
let VM_CNT = !not(IsAsync);
let ASYNC_CNT = IsAsync;
let is_flat_global = 1;
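
Following the comment above (async loads write straight to the DS address carried in vdst and do not use M0), the counter wiring now reads: LGKM_CNT is unconditionally 0, legacy LDS DMA still counts against VM_CNT, and the async variants count against ASYNC_CNT instead. A tiny model of just that assignment, taken from the three let-bindings in the hunk (the enum and function are illustrative):

// Illustrative model of the counter selection encoded by
// FLAT_Global_Load_LDS_Pseudo: LGKM_CNT = 0, VM_CNT = !IsAsync,
// ASYNC_CNT = IsAsync.
enum class LdsDmaCounter { VmCnt, AsyncCnt };

LdsDmaCounter counterFor(bool IsAsync) {
  return IsAsync ? LdsDmaCounter::AsyncCnt : LdsDmaCounter::VmCnt;
}
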
@@ -406,10 +417,10 @@ class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsy
let SchedRW = [WriteVMEM, WriteLDS];
}
-multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> {
- def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>,
+multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0, bit IsLegacyLDSDMA = 0> {
+ def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync, IsLegacyLDSDMA>,
GlobalSaddrTable<0, opName>;
- def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>,
+ def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync, IsLegacyLDSDMA>,
GlobalSaddrTable<1, opName>;
}
@@ -417,7 +428,7 @@ class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_P
opName,
(outs ),
!con(
- !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)), (ins VGPR_32:$vdata),
+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPROp_32:$vaddr), (ins VGPROp_64:$vaddr)), (ins VGPROp_32:$vdata),
(ins flat_offset:$offset, CPol_0:$cpol)),
" $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
let VM_CNT = 0;
@@ -511,7 +522,7 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n
let sve = 0;
}
-class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64_AlignTarget:$vaddr), string asm = " $vaddr"> :
+class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VGPROp_64:$vaddr), string asm = " $vaddr"> :
FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> {
let has_vdst = 0;
let has_data = 0;
@@ -524,7 +535,7 @@ class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64_AlignTarget:$v
multiclass FLAT_Flat_Prefetch_Pseudo<string opName> {
def "" : FLAT_Prefetch_Pseudo<opName>,
GlobalSaddrTable<0, opName>;
- def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">,
+ def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPROp_32:$vaddr), " $vaddr, $saddr">,
GlobalSaddrTable<1, opName> {
let OtherPredicates = [HasFlatGVSMode];
let enabled_saddr = 1;
@@ -533,9 +544,9 @@ multiclass FLAT_Flat_Prefetch_Pseudo<string opName> {
multiclass FLAT_Global_Prefetch_Pseudo<string opName> {
let is_flat_global = 1, has_saddr = 1 in {
- def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64_AlignTarget:$vaddr), " $vaddr, off">,
+ def "" : FLAT_Prefetch_Pseudo<opName, (ins VGPROp_64:$vaddr), " $vaddr, off">,
GlobalSaddrTable<0, opName>;
- def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">,
+ def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPROp_32:$vaddr), " $vaddr, $saddr">,
GlobalSaddrTable<1, opName> {
let enabled_saddr = 1;
}
@@ -557,11 +568,11 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterOperand regClass = AVLdSt
(outs regClass:$vdst),
!con(
!if(EnableSVE,
- (ins VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset),
+ (ins VGPROp_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset),
!if(EnableSaddr,
(ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset),
!if(EnableVaddr,
- (ins VGPR_32:$vaddr, flat_offset:$offset),
+ (ins VGPROp_32:$vaddr, flat_offset:$offset),
(ins flat_offset:$offset)))),
!if(HasTiedOutput, (ins CPol:$cpol, regClass:$vdst_in),
(ins CPol_0:$cpol))),
@@ -584,11 +595,11 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterOperand vdata_op, bit En
opName,
(outs),
!if(EnableSVE,
- (ins vdata_op:$vdata, VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol),
+ (ins vdata_op:$vdata, VGPROp_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol),
!if(EnableSaddr,
(ins vdata_op:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol),
!if(EnableVaddr,
- (ins vdata_op:$vdata, VGPR_32:$vaddr, flat_offset:$offset, CPol_0:$cpol),
+ (ins vdata_op:$vdata, VGPROp_32:$vaddr, flat_offset:$offset, CPol_0:$cpol),
(ins vdata_op:$vdata, flat_offset:$offset, CPol_0:$cpol)))),
" "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
let is_flat_scratch = 1;
@@ -687,11 +698,11 @@ class FLAT_Scratch_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0,
opName,
(outs ),
!if(EnableSVE,
- (ins VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol:$cpol),
+ (ins VGPROp_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol:$cpol),
!if(EnableSaddr,
(ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol:$cpol),
!if(EnableVaddr,
- (ins VGPR_32:$vaddr, flat_offset:$offset, CPol:$cpol),
+ (ins VGPROp_32:$vaddr, flat_offset:$offset, CPol:$cpol),
(ins flat_offset:$offset, CPol:$cpol)))),
" "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
@@ -754,7 +765,7 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN<
RegisterOperand data_op = vdst_op> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64_AlignTarget:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol),
+ (ins VGPROp_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol),
" $vaddr, $vdata$offset$cpol">,
GlobalSaddrTable<0, opName> {
let FPAtomic = data_vt.isFP;
@@ -763,7 +774,7 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN<
def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol),
+ (ins VGPROp_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol),
" $vaddr, $vdata, $saddr$offset$cpol">,
GlobalSaddrTable<1, opName> {
let OtherPredicates = [HasFlatGVSMode];
@@ -786,7 +797,7 @@ multiclass FLAT_Atomic_Pseudo_RTN<
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_op_vgpr:$vdst),
- (ins VReg_64_AlignTarget:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+ (ins VGPROp_64:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
" $vdst, $vaddr, $vdata$offset$cpol">,
GlobalSaddrTable<0, opName#"_rtn"> {
let FPAtomic = data_vt.isFP;
@@ -795,7 +806,7 @@ multiclass FLAT_Atomic_Pseudo_RTN<
def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_op_vgpr:$vdst),
- (ins VGPR_32:$vaddr, data_op_vgpr:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
+ (ins VGPROp_32:$vaddr, data_op_vgpr:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
" $vdst, $vaddr, $vdata, $saddr$offset$cpol">,
GlobalSaddrTable<1, opName#"_rtn"> {
let OtherPredicates = [HasFlatGVSMode];
@@ -811,7 +822,7 @@ multiclass FLAT_Atomic_Pseudo_RTN<
def _RTN_agpr : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_op_agpr:$vdst),
- (ins VReg_64_AlignTarget:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+ (ins VGPROp_64:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
" $vdst, $vaddr, $vdata$offset$cpol">,
GlobalSaddrTable<0, opName#"_rtn_agpr"> {
let FPAtomic = data_vt.isFP;
@@ -837,10 +848,10 @@ class FLAT_Global_Atomic_Pseudo_NO_RTN<
ValueType data_vt = vt,
RegisterOperand data_op = vdst_op,
bit EnableSaddr = false,
- RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)>
+ RegisterOperand VaddrOp = !if(EnableSaddr, VGPROp_32, VGPROp_64)>
: FLAT_AtomicNoRet_Pseudo<opName, (outs), (ins), "">, GlobalSaddrTable<EnableSaddr, opName> {
let InOperandList = !con(
- (ins VaddrRC:$vaddr, data_op:$vdata),
+ (ins VaddrOp:$vaddr, data_op:$vdata),
!if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)),
(ins flat_offset:$offset, CPol_0:$cpol));
let AsmOperands = " $vaddr, $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol";
@@ -867,7 +878,7 @@ class FLAT_Global_Atomic_Pseudo_RTN<
RegisterOperand data_op = vdst_op,
bit EnableSaddr = false,
bit IsVGPR = false,
- RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)>
+ RegisterOperand VaddrOp = !if(EnableSaddr, VGPROp_32, VGPROp_64)>
: FLAT_AtomicRet_Pseudo<opName, (outs), (ins), "">, GlobalSaddrTable<EnableSaddr, opName#"_rtn"#!if(IsVGPR, "", "_agpr")> {
defvar vdst_rc= !if(IsVGPR, getEquivalentVGPROperand<vdst_op>.ret, getEquivalentAGPROperand<vdst_op>.ret);
@@ -875,7 +886,7 @@ class FLAT_Global_Atomic_Pseudo_RTN<
let OutOperandList = (outs vdst_rc:$vdst);
let InOperandList = !con(
- (ins VaddrRC:$vaddr, data_rc:$vdata),
+ (ins VaddrOp:$vaddr, data_rc:$vdata),
!if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)),
(ins flat_offset:$offset, CPol_GLC1:$cpol));
let AsmOperands = " $vdst, $vaddr, $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol";
@@ -1202,15 +1213,15 @@ let SubtargetPredicate = HasGFX10_BEncoding in {
VGPROp_32, i32>;
}
-defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte">;
-defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sbyte">;
-defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ushort">;
-defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">;
-defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">;
+defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte", 0, 1>;
+defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sbyte", 0, 1>;
+defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ushort", 0, 1>;
+defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort", 0, 1>;
+defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword", 0, 1>;
let SubtargetPredicate = HasGFX950Insts in {
-defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx3">;
-defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx4">;
+defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx3", 0, 1>;
+defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx4", 0, 1>;
}
let SubtargetPredicate = isGFX12PlusNot12_50 in
@@ -1224,7 +1235,7 @@ let SubtargetPredicate = isGFX12Plus in {
def GLOBAL_WBINV : FLAT_Global_Invalidate_Writeback<"global_wbinv">;
} // End SubtargetPredicate = isGFX12Plus
-let SubtargetPredicate = isGFX1250Plus in {
+let SubtargetPredicate = HasMcastLoadInsts in {
let Uses = [M0, EXEC, ASYNCcnt], WaveSizePredicate = isWave32 in {
defm CLUSTER_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b8", 1>;
@@ -1243,7 +1254,7 @@ defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_s
def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>;
def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">;
-} // End SubtargetPredicate = isGFX1250Plus
+} // End SubtargetPredicate = HasMcastLoadInsts
defm SCRATCH_LOAD_UBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte">;
defm SCRATCH_LOAD_SBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte">;
@@ -1404,62 +1415,62 @@ class FlatSignedLoadPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType
>;
class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)),
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)),
(inst $saddr, $voffset, $offset, $cpol, $in)
>;
class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)),
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)),
(inst $saddr, $voffset, $offset, $cpol, $in)
>;
class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol))),
(inst $saddr, $voffset, $offset, $cpol)
>;
class FlatLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol))),
(EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16)
>;
class FlatLoadLDSSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
- (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol), M0),
+ (node (i64 VReg_64:$vaddr), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol), M0),
(inst $dsaddr, $vaddr, $offset, $cpol)
>;
class GlobalLoadLDSSaddrPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
- (node (GlobalSAddrNoIOffsetM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm), M0),
+ (node (GlobalSAddrNoIOffsetM0 (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), CPol:$cpol), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 timm), M0),
(inst $dsaddr, $saddr, $voffset, $offset, $cpol)
>;
class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
- (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
+ (node (i64 VReg_64:$vaddr), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
(inst $dsaddr, $vaddr, $offset, $cpol)
>;
class GlobalLoadLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
- (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
+ (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), CPol:$cpol), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
(inst $dsaddr, $saddr, $voffset, $offset, $cpol)
>;
class FlatStoreLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
- (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
+ (node (i64 VReg_64:$vaddr), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
(inst $vaddr, $dsaddr, $offset, $cpol)
>;
class GlobalStoreLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
- (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
+ (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), CPol:$cpol), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
(inst $saddr, $voffset, $dsaddr, $offset, $cpol)
>;
class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol))),
(inst $saddr, $voffset, $offset, $cpol)
>;
class GlobalLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol))),
(EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16)
>;
@@ -1469,7 +1480,7 @@ class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
>;
class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol))),
(inst $saddr, $voffset, $offset, $cpol)
>;
@@ -1479,7 +1490,7 @@ class FlatLoadSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType
>;
class GlobalLoadSaddrPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddrCPolM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm), M0)),
+ (vt (node (GlobalSAddrCPolM0 (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm), M0)),
(inst $saddr, $voffset, $offset, $cpol)
>;
@@ -1489,19 +1500,19 @@ class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
>;
class GlobalLoadSaddrPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddrCPol (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm))),
+ (vt (node (GlobalSAddrCPol (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm))),
(inst $saddr, $voffset, $offset, $cpol)
>;
class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
- (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol)),
+ (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol)),
(inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset, $cpol)
>;
class FlatAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ComplexPattern pat,
ValueType vt, ValueType data_vt = vt> : GCNPat <
- (vt (node (pat (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), data_vt:$data)),
+ (vt (node (pat (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), data_vt:$data)),
(inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset, $cpol)> {
let SubtargetPredicate = inst.SubtargetPredicate;
let OtherPredicates = inst.OtherPredicates;
@@ -1509,7 +1520,7 @@ class FlatAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ComplexPatte
class GlobalAtomicNoRtnSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
- (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$data),
+ (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), vt:$data),
(inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset, $cpol)
>;
@@ -1539,7 +1550,7 @@ multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType
let AddedComplexity = 1 in
def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
- (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
+ (inst VGPROp_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
let SubtargetPredicate = inst.SubtargetPredicate;
let OtherPredicates = inst.OtherPredicates;
}
@@ -1552,10 +1563,6 @@ multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType
}
}
-multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addrSpaceSuffix,
- ValueType vt> :
- FlatAtomicNoRtnPatBase<inst, node # "_noret_" # addrSpaceSuffix, vt, vt>;
-
multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>;
@@ -1568,7 +1575,7 @@ multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt,
defvar rtnNode = !cast<SDPatternOperator>(node);
def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
- (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
+ (inst VGPROp_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
let SubtargetPredicate = inst.SubtargetPredicate;
let OtherPredicates = inst.OtherPredicates;
}
@@ -1580,10 +1587,6 @@ multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt,
}
}
-multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix,
- ValueType vt> :
- FlatAtomicRtnPatBase<inst, intr # "_" # addrSpaceSuffix, vt, vt>;
-
multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt), vt, data_vt>;
@@ -1612,7 +1615,7 @@ multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt,
class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt, ValueType data_vt = vt> : GCNPat <
(vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)),
- (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
+ (inst VGPROp_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
let SubtargetPredicate = inst.SubtargetPredicate;
let OtherPredicates = inst.OtherPredicates;
}
@@ -1631,27 +1634,27 @@ multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
}
class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))),
+ (vt (node (ScratchOffset (i32 VGPROp_32:$vaddr), i32:$offset))),
(inst $vaddr, $offset)
>;
class ScratchLoadSignedPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset), vt:$in),
+ (node (ScratchOffset (i32 VGPROp_32:$vaddr), i32:$offset), vt:$in),
(inst $vaddr, $offset, 0, $in)
>;
class ScratchLoadSignedPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))),
+ (vt (node (ScratchOffset (i32 VGPROp_32:$vaddr), i32:$offset))),
(inst $vaddr, $offset, 0)
>;
class ScratchLoadSignedPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))),
+ (vt (node (ScratchOffset (i32 VGPROp_32:$vaddr), i32:$offset))),
(EXTRACT_SUBREG (inst $vaddr, $offset), lo16)
>;
class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node vt:$data, (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset)),
+ (node vt:$data, (ScratchOffset (i32 VGPROp_32:$vaddr), i32:$offset)),
(inst getVregSrcForVT<vt>.ret:$data, $vaddr, $offset)
>;
@@ -1682,28 +1685,28 @@ class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
>;
class ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
+ (vt (node (ScratchSVAddr (i32 VGPROp_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
(inst $vaddr, $saddr, $offset, $cpol)
>;
class ScratchStoreSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
- (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol)),
+ (node vt:$data, (ScratchSVAddr (i32 VGPROp_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol)),
(inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset, $cpol)
>;
class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol), vt:$in)),
+ (vt (node (ScratchSVAddr (i32 VGPROp_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol), vt:$in)),
(inst $vaddr, $saddr, $offset, $cpol, $in)
>;
class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
+ (vt (node (ScratchSVAddr (i32 VGPROp_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
(inst $vaddr, $saddr, $offset, $cpol)
>;
class ScratchLoadSVaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
+ (vt (node (ScratchSVAddr (i32 VGPROp_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
(EXTRACT_SUBREG (inst $vaddr, $saddr, $offset, $cpol), lo16)
>;
@@ -2169,14 +2172,16 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
}
+let SubtargetPredicate = HasAtomicCondSubClampFlatInsts in {
+  defm : FlatAtomicRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32>;
+
+ defm : FlatAtomicNoRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32>;
+}
} // end foreach as
defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
-defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
-defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
-
let OtherPredicates = [HasD16LoadStore] in {
defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
@@ -2340,10 +2345,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", "atomic_load_or_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>;
-defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
-defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>;
@@ -2360,10 +2365,8 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>;
let SubtargetPredicate = isGFX12Plus in {
- defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
-
- let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
- defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+ defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
+ defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
}
let OtherPredicates = [isGFX12PlusNot12_50] in
@@ -2387,13 +2390,13 @@ let WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts]
}
let OtherPredicates = [isGFX125xOnly] in {
- def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B32, int_amdgcn_flat_load_monitor_b32, i32>;
- def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B64, int_amdgcn_flat_load_monitor_b64, v2i32>;
- def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B128, int_amdgcn_flat_load_monitor_b128, v4i32>;
+ def : FlatLoadPat <FLAT_LOAD_MONITOR_B32, AMDGPUflat_load_monitor, i32>;
+ def : FlatLoadPat <FLAT_LOAD_MONITOR_B64, AMDGPUflat_load_monitor, v2i32>;
+ def : FlatLoadPat <FLAT_LOAD_MONITOR_B128, AMDGPUflat_load_monitor, v4i32>;
- defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B32, int_amdgcn_global_load_monitor_b32, i32>;
- defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B64, int_amdgcn_global_load_monitor_b64, v2i32>;
- defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_MONITOR_B32, AMDGPUglobal_load_monitor, i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_MONITOR_B64, AMDGPUglobal_load_monitor, v2i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_MONITOR_B128, AMDGPUglobal_load_monitor, v4i32>;
} // End OtherPredicates = [isGFX125xOnly]
let OtherPredicates = [isGFX1250Plus] in {
@@ -2450,7 +2453,7 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>;
-let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
+let OtherPredicates = [HasFlatScratchInsts, HasFlatScratchEnabled] in {
defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, extloadi8_private, i32>;
defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, zextloadi8_private, i32>;
@@ -2508,12 +2511,12 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i32>;
defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, truncstorei16_private, i32>;
defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX3, store_private, v3i32>;
-let OtherPredicates = [HasD16LoadStore, HasFlatScratchInsts, EnableFlatScratch] in {
+let OtherPredicates = [HasD16LoadStore, HasFlatScratchInsts, HasFlatScratchEnabled] in {
defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT_D16_HI, truncstorei16_hi16_private, i32>;
defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE_D16_HI, truncstorei8_hi16_private, i32>;
}
-let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, EnableFlatScratch] in {
+let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, HasFlatScratchEnabled] in {
defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2i16>;
defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2f16>;
defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_private, v2i16>;
@@ -2529,7 +2532,7 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2i
defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f16>;
}
-} // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch]
+} // End OtherPredicates = [HasFlatScratchInsts, HasFlatScratchEnabled]
def PrefetchLoc: SDNodeXForm<timm, [{
uint32_t V = N->getZExtValue();
@@ -2568,7 +2571,7 @@ multiclass FlatPrefetchPats<string inst, SDPatternOperator prefetch_kind, SDPatt
}
def : GCNPat <
- (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one),
+ (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one),
(!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, (i32 (PrefetchLoc $loc)))
> {
let AddedComplexity = !if(!eq(rw, i32imm_zero), 11, 30);
@@ -2582,7 +2585,7 @@ multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> {
>;
def : GCNPat <
- (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol),
+ (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset), timm:$cpol),
(!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> {
let AddedComplexity = 11;
}
@@ -3642,17 +3645,6 @@ multiclass VFLAT_Real_gfx1250<bits<8> op,
}
}
-multiclass VFLAT_Aliases_gfx1250<string name> {
- defvar ps = get_FLAT_ps<NAME>;
- if !ne(ps.Mnemonic, name) then
- def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX125xOnly]>;
-}
-
-multiclass VFLAT_Real_Base_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> :
- VFLAT_Aliases_gfx1250<name> {
- defm "" : VFLAT_Real_gfx1250<op, name>;
-}
-
multiclass VFLAT_Real_RTN_gfx1250<bits<8> op, string name> {
defm _RTN : VFLAT_Real_gfx1250<op, name>;
}
@@ -3665,9 +3657,14 @@ multiclass VFLAT_Real_SADDR_RTN_gfx1250<bits<8> op, string name> {
defm _SADDR_RTN : VFLAT_Real_gfx1250<op, name>;
}
-multiclass VFLAT_Real_AllAddr_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> :
- VFLAT_Real_Base_gfx1250<op, name>,
- VFLAT_Real_SADDR_gfx1250<op, name>;
+multiclass VFLAT_Real_AllAddr_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic,
+ list<Predicate> aliasPreds = [isGFX125xOnly]> :
+ VFLAT_Real_gfx1250<op, name>,
+ VFLAT_Real_SADDR_gfx1250<op, name> {
+ defvar ps = get_FLAT_ps<NAME>;
+ if !ne(ps.Mnemonic, name) then
+ def : MnemonicAlias<ps.Mnemonic, name>, Requires<aliasPreds>;
+}
multiclass VFLAT_Real_Atomics_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> :
VFLAT_Real_AllAddr_gfx1250<op, name>,
@@ -3711,6 +3708,12 @@ defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "globa
defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>;
defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>;
+// Additional aliases for global load transpose instructions.
+def : MnemonicAlias<"global_load_b128_tr_b16", "global_load_tr16_b128">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"global_load_b64_tr_b8", "global_load_tr8_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"global_load_b64_tr_b4", "global_load_tr4_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"global_load_b96_tr_b6", "global_load_tr6_b96">, Requires<[isGFX125xOnly]>;
+
defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>;
defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">;
defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">;
diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
index 27f40f1..72805aa 100644
--- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
@@ -134,6 +134,7 @@ public:
LLVM_DEBUG(dbgs() << "CreateVOPD Pass:\n");
const SIInstrInfo *SII = ST->getInstrInfo();
+ const SIRegisterInfo *TRI = ST->getRegisterInfo();
bool Changed = false;
unsigned EncodingFamily = AMDGPU::getVOPDEncodingFamily(*ST);
bool HasVOPD3 = ST->hasVOPD3();
@@ -160,16 +161,25 @@ public:
llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD =
AMDGPU::getCanBeVOPD(Opc2, EncodingFamily, VOPD3);
- if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y)
+ if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y &&
+ llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI, VOPD3)) {
CI = VOPDCombineInfo(FirstMI, SecondMI, VOPD3);
- else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X)
+ return true;
+ }
+      // We can try swapping the order of the instructions, but then neither
+      // instruction may write a register the other reads: OpX must not write
+      // anything OpY reads because the hardware forbids it, and OpY must not
+      // write anything OpX reads because that would violate the data
+      // dependency in the original program order.
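+      // E.g. (hypothetical pair): if FirstMI is "v_mov_b32 v0, v1" and
+      // SecondMI is "v_add_f32 v2, v0, v3", the swapped order would place the
+      // add, which reads v0, ahead of the mov that writes it, so the swap
+      // must be rejected.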
+ for (const auto &Use : SecondMI->uses())
+ if (Use.isReg() && FirstMI->modifiesRegister(Use.getReg(), TRI))
+ return false;
+ if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X &&
+ llvm::checkVOPDRegConstraints(*SII, *SecondMI, *FirstMI, VOPD3)) {
CI = VOPDCombineInfo(SecondMI, FirstMI, VOPD3);
- else
- return false;
- // checkVOPDRegConstraints cares about program order, but doReplace
- // cares about X-Y order in the constituted VOPD
- return llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI,
- VOPD3);
+ return true;
+ }
+ return false;
};
if (checkVOPD(false) || (HasVOPD3 && checkVOPD(true))) {
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 464cbec..6ba669f 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -43,6 +43,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
@@ -256,7 +257,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()),
*MRI));
auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI);
- DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
+ DPPInst.addReg(CombOldVGPR.Reg, getUndefRegState(!Def),
CombOldVGPR.SubReg);
++NumOperands;
} else if (TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a911e7e..d504d86 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -16,6 +16,7 @@
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/TargetParser/TargetParser.h"
@@ -190,6 +191,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (checkFPAtomicToDenormModeHazard(MI) > 0)
return HazardType;
+ // Hazards which cannot be mitigated with S_NOPs.
+ if (!IsHazardRecognizerMode) {
+ if (checkWMMACoexecutionHazards(MI) > 0)
+ return Hazard;
+ }
+
if (ST.hasNoDataDepHazard())
return NoHazard;
@@ -435,10 +442,7 @@ void GCNHazardRecognizer::RecedeCycle() {
// Helper Functions
//===----------------------------------------------------------------------===//
-using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
-
-using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
-using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
+enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound };
// Search for a hazard in a block and its predecessors.
template <typename StateT>
@@ -546,11 +550,14 @@ hasHazard(StateT InitialState,
// Returns the minimum wait states since \p I, walking all predecessors.
// Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
-static int getWaitStatesSince(
- GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
- MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
- IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
- GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
+static int
+getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+ const MachineBasicBlock *MBB,
+ MachineBasicBlock::const_reverse_instr_iterator I,
+ int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
+ DenseSet<const MachineBasicBlock *> &Visited,
+ GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
+ SIInstrInfo::getNumWaitStates) {
for (auto E = MBB->instr_rend(); I != E; ++I) {
// Don't add WaitStates for parent BUNDLE instructions.
if (I->isBundle())
@@ -582,20 +589,26 @@ static int getWaitStatesSince(
return MinWaitStates;
}
-static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
- const MachineInstr *MI, IsExpiredFn IsExpired) {
+static int
+getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+ const MachineInstr *MI,
+ GCNHazardRecognizer::IsExpiredFn IsExpired,
+ GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
+ SIInstrInfo::getNumWaitStates) {
DenseSet<const MachineBasicBlock *> Visited;
return getWaitStatesSince(IsHazard, MI->getParent(),
std::next(MI->getReverseIterator()), 0, IsExpired,
- Visited, SIInstrInfo::getNumWaitStates);
+ Visited, GetNumWaitStates);
}
-int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
+int GCNHazardRecognizer::getWaitStatesSince(
+ IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) {
if (IsHazardRecognizerMode) {
auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
return WaitStates >= Limit;
};
- return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
+ return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
+ GetNumWaitStates);
}
int WaitStates = 0;
@@ -607,7 +620,7 @@ int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
if (MI->isInlineAsm())
continue;
}
- ++WaitStates;
+ WaitStates += MI ? GetNumWaitStates(*MI) : 1;
if (WaitStates >= Limit)
break;
@@ -615,6 +628,10 @@ int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
return std::numeric_limits<int>::max();
}
+int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
+ return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates);
+}
+
int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
IsHazardFn IsHazardDef,
int Limit) {
@@ -643,7 +660,7 @@ int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
MCRegister Reg) {
for (MCRegUnit Unit : TRI.regunits(Reg))
- BV.set(Unit);
+ BV.set(static_cast<unsigned>(Unit));
}
static void addRegsToSet(const SIRegisterInfo &TRI,
@@ -1243,6 +1260,20 @@ int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}
+// Emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we
+// need to insert; a negative value means none are needed.
+bool GCNHazardRecognizer::emitVNops(MachineInstr *MI, int WaitStatesNeeded) {
+ if (WaitStatesNeeded <= 0)
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ for (int I = 0; I < WaitStatesNeeded; ++I)
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_NOP_e32));
+
+ return true;
+}
+
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVMEMtoScalarWriteHazards(MI);
fixVcmpxPermlaneHazards(MI);
@@ -1257,7 +1288,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVALUTransUseHazard(MI);
fixVALUTransCoexecutionHazards(MI);
fixWMMAHazards(MI); // fall-through if co-execution is enabled.
- fixWMMACoexecutionHazards(MI);
+ emitVNops(MI, checkWMMACoexecutionHazards(MI));
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
fixRequiredExportPriority(MI);
@@ -1306,8 +1337,8 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
bool IsUndef = Src0->isUndef();
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_MOV_B32_e32))
- .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
- .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
+ .addReg(Reg, RegState::Define | getDeadRegState(IsUndef))
+ .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
return true;
}
@@ -1354,7 +1385,7 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
const SIInstrInfo *TII = ST.getInstrInfo();
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST));
return true;
}
@@ -1487,7 +1518,7 @@ bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0, ST));
return true;
}
@@ -1502,9 +1533,8 @@ static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
bool HasVmem = false;
for (auto &MBB : MF) {
for (auto &MI : MBB) {
- HasLds |= SIInstrInfo::isDS(MI);
- HasVmem |= (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
- SIInstrInfo::isSegmentSpecificFLAT(MI);
+ HasLds |= SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI);
+ HasVmem |= SIInstrInfo::isVMEM(MI);
if (HasLds && HasVmem)
return true;
}
@@ -1526,10 +1556,9 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
assert(!ST.hasExtendedWaitCounts());
auto IsHazardInst = [](const MachineInstr &MI) {
- if (SIInstrInfo::isDS(MI))
+ if (SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI))
return 1;
- if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
- SIInstrInfo::isSegmentSpecificFLAT(MI))
+ if (SIInstrInfo::isVMEM(MI))
return 2;
return 0;
};
@@ -1653,7 +1682,7 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
} else {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST));
}
return true;
@@ -1811,7 +1840,7 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0, ST));
return true;
}
@@ -1897,13 +1926,13 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
// avoided.
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0, ST));
return true;
}
bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
- if (!AMDGPU::isGFX1250(ST) || // Coexecution disabled.
+ if (!ST.hasGFX1250Insts() || // Coexecution disabled.
!SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI))
return false;
@@ -2047,13 +2076,13 @@ static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
return false;
}
-bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
- if (!AMDGPU::isGFX1250(ST))
- return false;
+int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
+ if (!ST.hasGFX1250Insts())
+ return 0;
const SIInstrInfo *TII = ST.getInstrInfo();
if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
- return false;
+ return 0;
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -2131,9 +2160,6 @@ bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
};
int Limit = 0;
- auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) {
- return WaitStates >= Limit;
- };
auto GetWaitStatesFn = [](const MachineInstr &I) {
return SIInstrInfo::isVALU(I) ? 1 : 0;
@@ -2143,38 +2169,26 @@ bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
if (TII->isXDLWMMA(*MI)) {
for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
- DenseSet<const MachineBasicBlock *> Visited;
- // '::getWaitStatesSince' returns the number of VALUs in between if hazard
+      // 'getWaitStatesSince' returns the number of VALUs in between if a hazard
// exists, and INT_MAX if there is no hazard. As a result, a negative
// WaitStatesNeeded here means no hazard, and we will continue to search
// for other categories.
WaitStatesNeeded =
- Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(),
- std::next(MI->getReverseIterator()), 0,
- IsExpiredFn, Visited, GetWaitStatesFn);
+ Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn);
}
} else { // Must be a co-executable VALU.
for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
Limit = VALUWaitStates[Category]; // for IsExpiredFn.
- DenseSet<const MachineBasicBlock *> Visited;
- // '::getWaitStatesSince' returns the number of VALUs in between if hazard
+      // 'getWaitStatesSince' returns the number of VALUs in between if a hazard
// exists, and INT_MAX if there is no hazard. As a result, a negative
// WaitStatesNeeded here means no hazard, and we will continue to search
// for other categories.
WaitStatesNeeded =
- Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(),
- std::next(MI->getReverseIterator()), 0,
- IsExpiredFn, Visited, GetWaitStatesFn);
+ Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn);
}
}
- // WaitStatesNeeded now is the number of V_NOPs we need to insert, negative
- // means not needed.
- for (int i = 0; i < WaitStatesNeeded; i++)
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(AMDGPU::V_NOP_e32));
-
- return true;
+ return WaitStatesNeeded;
}
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
@@ -2204,16 +2218,33 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
return false;
- MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
- bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
- bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
- bool Overlapped = OverlappedSrc || OverlappedDst;
-
- assert(!OverlappedDst || !OverlappedSrc ||
- Src1->getReg() == MI->getOperand(0).getReg());
assert(ST.needsAlignedVGPRs());
static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
+ const DebugLoc &DL = MI->getDebugLoc();
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
+
+ // In:
+ //
+ // Dst = shiftrev64 Amt, Src1
+ //
+  // if Dst != Src1, then avoid the bug with:
+ //
+ // Dst.sub0 = Amt
+ // Dst = shift64 Dst.sub0, Src1
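+  //
+  // E.g. (hypothetical registers) with Dst = v[4:5], Amt = v9 and
+  // Src1 = v[6:7]: emit "v_mov_b32 v4, v9" and rewrite the shift to take its
+  // amount from v4.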
+
+ Register DstReg = MI->getOperand(0).getReg();
+ if (!Src1->isReg() || Src1->getReg() != DstReg) {
+ Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
+ runOnInstruction(
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstLo).add(*Amt));
+ Amt->setReg(DstLo);
+ Amt->setIsKill(true);
+ return true;
+ }
+
+ bool Overlapped = MI->modifiesRegister(AmtReg, &TRI);
Register NewReg;
for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
: AMDGPU::VGPR_32RegClass) {
@@ -2230,8 +2261,6 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
if (Overlapped)
NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
- DebugLoc DL = MI->getDebugLoc();
- MachineBasicBlock *MBB = MI->getParent();
// Insert a full wait count because found register might be pending a wait.
BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
.addImm(0);
@@ -2269,9 +2298,8 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
Amt->setIsKill(false);
// We do not update liveness, so verifier may see it as undef.
Amt->setIsUndef();
- if (OverlappedDst)
+ if (Overlapped) {
MI->getOperand(0).setReg(NewReg);
- if (OverlappedSrc) {
Src1->setReg(NewReg);
Src1->setIsKill(false);
Src1->setIsUndef();
@@ -3267,29 +3295,103 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
return false;
assert(!ST.hasExtendedWaitCounts());
- if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
+ if (!ST.isWave64())
+ return false;
+
+ const bool IsSALU = SIInstrInfo::isSALU(*MI);
+ const bool IsVALU = SIInstrInfo::isVALU(*MI);
+ if (!IsSALU && !IsVALU)
return false;
// The hazard sequence is three instructions:
// 1. VALU reads SGPR as mask
- // 2. SALU writes SGPR
- // 3. SALU reads SGPR
- // The hazard can expire if the distance between 2 and 3 is sufficient.
- // In practice this happens <10% of the time, hence this always assumes
- // the hazard exists if 1 and 2 are present to avoid searching.
+ // 2. VALU/SALU writes SGPR
+ // 3. VALU/SALU reads SGPR
+  // The hazard can expire if the distance between (2) and (3) is sufficient,
+  // or if (2) is a VALU and (3) is a SALU.
+  // In practice this happens <10% of the time, hence always assume the hazard
+  // exists if (1) and (2) are present, to avoid searching all SGPR reads.
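+  // A hypothetical wave64 instance of the sequence:
+  //   v_cndmask_b32 v0, v1, v2, s[4:5]   ; (1) VALU reads SGPR pair as mask
+  //   s_mov_b64 s[4:5], -1               ; (2) SALU overwrites the mask SGPRs
+  //   s_and_b64 s[6:7], s[4:5], s[8:9]   ; (3) SALU reads them again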
- const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
- if (!SDSTOp || !SDSTOp->isReg())
- return false;
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  auto IgnorableSGPR = [](const Register Reg) {
+ switch (Reg) {
+ case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
+ case AMDGPU::M0:
+ case AMDGPU::SGPR_NULL:
+ case AMDGPU::SGPR_NULL64:
+ case AMDGPU::SCC:
+ return true;
+ default:
+ return false;
+ }
+ };
+ auto IsVCC = [](const Register Reg) {
+ return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
+ };
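+  // EXEC, M0, null and SCC never participate in this hazard. VCC does, even
+  // as an implicit operand, so it is matched by name here rather than through
+  // the SGPR class check below.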
- const Register HazardReg = SDSTOp->getReg();
- if (HazardReg == AMDGPU::EXEC ||
- HazardReg == AMDGPU::EXEC_LO ||
- HazardReg == AMDGPU::EXEC_HI ||
- HazardReg == AMDGPU::M0)
+ struct StateType {
+ SmallSet<Register, 2> HazardSGPRs;
+
+ static unsigned getHashValue(const StateType &State) {
+ return hash_combine_range(State.HazardSGPRs);
+ }
+ static bool isEqual(const StateType &LHS, const StateType &RHS) {
+ return LHS.HazardSGPRs == RHS.HazardSGPRs;
+ }
+ };
+
+ SmallVector<const MachineInstr *> WaitInstrs;
+ bool HasSGPRRead = false;
+ StateType InitialState;
+
+ // Look for SGPR write.
+ MachineOperand *HazardDef = nullptr;
+ for (MachineOperand &Op : MI->operands()) {
+ if (!Op.isReg())
+ continue;
+ if (Op.isDef() && HazardDef)
+ continue;
+
+ Register Reg = Op.getReg();
+    if (IgnorableSGPR(Reg))
+ continue;
+ if (!IsVCC(Reg)) {
+ if (Op.isImplicit())
+ continue;
+ if (!TRI->isSGPRReg(MRI, Reg))
+ continue;
+ }
+ // Also check for SGPR reads.
+ if (Op.isUse()) {
+ HasSGPRRead = true;
+ continue;
+ }
+
+ assert(!HazardDef);
+ HazardDef = &Op;
+ }
+
+ if (!HazardDef)
return false;
- auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
+  // Set up tracking of writes to individual SGPRs.
+ const Register HazardReg = HazardDef->getReg();
+ if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
+ InitialState.HazardSGPRs.insert(HazardReg);
+ } else {
+ assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
+ InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
+ InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
+ }
+
+ auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
+ if (State.HazardSGPRs.empty())
+ return HazardExpired;
+
switch (I.getOpcode()) {
case AMDGPU::V_ADDC_U32_e32:
case AMDGPU::V_ADDC_U32_dpp:
@@ -3304,11 +3406,10 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
case AMDGPU::V_SUBB_U32_e32:
case AMDGPU::V_SUBB_U32_dpp:
case AMDGPU::V_SUBBREV_U32_e32:
- case AMDGPU::V_SUBBREV_U32_dpp:
+ case AMDGPU::V_SUBBREV_U32_dpp: {
// These implicitly read VCC as mask source.
- return HazardReg == AMDGPU::VCC ||
- HazardReg == AMDGPU::VCC_LO ||
- HazardReg == AMDGPU::VCC_HI;
+ return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
+ }
case AMDGPU::V_ADDC_U32_e64:
case AMDGPU::V_ADDC_U32_e64_dpp:
case AMDGPU::V_CNDMASK_B16_t16_e64:
@@ -3324,68 +3425,110 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
// Only check mask register overlaps.
const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
assert(SSRCOp);
- return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
+ bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
+ return Result ? HazardFound : NoHazardFound;
}
default:
- return false;
+ return NoHazardFound;
}
};
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
- // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
- if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
- return true;
-
- // VALU access to any SGPR or literal constant other than HazardReg
- // mitigates hazard. No need to check HazardReg here as this will
- // only be called when !IsHazardFn.
- if (!SIInstrInfo::isVALU(I))
- return false;
- for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
- const MachineOperand &Op = I.getOperand(OpNo);
- if (Op.isReg()) {
- Register OpReg = Op.getReg();
- // Only consider uses
- if (!Op.isUse())
+ const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
+ AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0, ST),
+ 0),
+ 0);
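+  // ConstantMaskBits zeroes only sa_sdst, va_sdst and va_vcc, leaving every
+  // other field at its no-wait default, so the "(Imm & ConstantMaskBits) ==
+  // ConstantMaskBits" test below accepts a wait for merging only if it waits
+  // on nothing but those three counters.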
+ auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
+ switch (I.getOpcode()) {
+ case AMDGPU::S_WAITCNT_DEPCTR:
+      // Record mergeable waits in regions of instructions free of SGPR reads.
+ if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
+ (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
+ WaitInstrs.push_back(&I);
+ break;
+ default:
+ // Update tracking of SGPR reads and writes.
+ for (auto &Op : I.operands()) {
+ if (!Op.isReg())
continue;
- // Ignore EXEC
- if (OpReg == AMDGPU::EXEC ||
- OpReg == AMDGPU::EXEC_LO ||
- OpReg == AMDGPU::EXEC_HI)
+
+ Register Reg = Op.getReg();
+      if (IgnorableSGPR(Reg))
continue;
- // Ignore all implicit uses except VCC
- if (Op.isImplicit()) {
- if (OpReg == AMDGPU::VCC ||
- OpReg == AMDGPU::VCC_LO ||
- OpReg == AMDGPU::VCC_HI)
- return true;
+ if (!IsVCC(Reg)) {
+ if (Op.isImplicit())
+ continue;
+ if (!TRI->isSGPRReg(MRI, Reg))
+ continue;
+ }
+ if (Op.isUse()) {
+ HasSGPRRead = true;
continue;
}
- if (TRI.isSGPRReg(MRI, OpReg))
- return true;
- } else {
- const MCInstrDesc &InstDesc = I.getDesc();
- const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
- if (!TII.isInlineConstant(Op, OpInfo))
- return true;
+
+ // Stop tracking any SGPRs with writes on the basis that they will
+ // already have an appropriate wait inserted afterwards.
+ SmallVector<Register, 2> Found;
+ for (Register SGPR : State.HazardSGPRs) {
+ if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
+ Found.push_back(SGPR);
+ }
+ for (Register SGPR : Found)
+ State.HazardSGPRs.erase(SGPR);
}
+ break;
}
- return false;
};
// Check for hazard
- if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
- std::numeric_limits<int>::max())
+ if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
+ MI->getParent(),
+ std::next(MI->getReverseIterator())))
return false;
- auto NextMI = std::next(MI->getIterator());
+  // Compute the counter mask.
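+  // E.g. a VALU instruction writing VCC waits with va_vcc(0), a VALU
+  // instruction writing any other SGPR with va_sdst(0), and a SALU write
+  // with sa_sdst(0).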
+ unsigned DepCtr =
+ IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST)
+ : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST))
+ : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST);
+
+ // Try to merge previous waits into this one for regions with no SGPR reads.
+ if (!WaitInstrs.empty()) {
+ // Note: WaitInstrs contains const pointers, so walk backward from MI to
+ // obtain a mutable pointer to each instruction to be merged.
+ // This is expected to be a very short walk within the same block.
+ SmallVector<MachineInstr *> ToErase;
+ unsigned Found = 0;
+ for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
+ End = MI->getParent()->rend();
+ Found < WaitInstrs.size() && It != End; ++It) {
+ MachineInstr *WaitMI = &*It;
+ // Find next wait instruction.
+ if (std::as_const(WaitMI) != WaitInstrs[Found])
+ continue;
+ Found++;
+ unsigned WaitMask = WaitMI->getOperand(0).getImm();
+ assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
+ DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
+ DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
+ AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
+ DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
+ DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
+ AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
+ DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
+ DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
+ AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
+ ToErase.push_back(WaitMI);
+ }
+ assert(Found == WaitInstrs.size());
+ for (MachineInstr *WaitMI : ToErase)
+ WaitMI->eraseFromParent();
+ }
- // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
+ // Add s_waitcnt_depctr after SGPR write.
+ auto NextMI = std::next(MI->getIterator());
auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+ .addImm(DepCtr);
// SALU write may be s_getpc in a bundle.
updateGetPCBundle(NewMI);
@@ -3531,10 +3674,10 @@ bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
const SIInstrInfo *TII = ST.getInstrInfo();
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(0xFFE3);
+ .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST));
BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(0xFFE3);
+ .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST));
return true;
}
@@ -3611,7 +3754,7 @@ bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
.addImm(AMDGPU::DepCtr::encodeFieldVaSdst(
- AMDGPU::DepCtr::encodeFieldSaSdst(0), 0));
+ AMDGPU::DepCtr::encodeFieldSaSdst(0, ST), 0));
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 67beffa..d725134 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -32,6 +32,8 @@ class GCNSubtarget;
class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
public:
typedef function_ref<bool(const MachineInstr &)> IsHazardFn;
+ typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
+ typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;
private:
// Distinguish if we are called from scheduler or hazard recognizer
@@ -74,6 +76,8 @@ private:
// used on a newly inserted instruction before returning from PreEmitNoops.
void runOnInstruction(MachineInstr *MI);
+ int getWaitStatesSince(IsHazardFn IsHazard, int Limit,
+ GetNumWaitStatesFn GetNumWaitStates);
int getWaitStatesSince(IsHazardFn IsHazard, int Limit);
int getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit);
int getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit);
@@ -94,6 +98,9 @@ private:
int checkReadM0Hazards(MachineInstr *SMovRel);
int checkNSAtoVMEMHazard(MachineInstr *MI);
int checkFPAtomicToDenormModeHazard(MachineInstr *MI);
+ // Emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we
+  // need to insert; a negative value means none are needed.
+ bool emitVNops(MachineInstr *MI, int WaitStatesNeeded);
void fixHazards(MachineInstr *MI);
bool fixVcmpxPermlaneHazards(MachineInstr *MI);
bool fixVMEMtoScalarWriteHazards(MachineInstr *MI);
@@ -106,7 +113,7 @@ private:
bool fixVALUTransUseHazard(MachineInstr *MI);
bool fixVALUTransCoexecutionHazards(MachineInstr *MI);
bool fixWMMAHazards(MachineInstr *MI);
- bool fixWMMACoexecutionHazards(MachineInstr *MI);
+ int checkWMMACoexecutionHazards(MachineInstr *MI);
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
bool fixRequiredExportPriority(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index f253a84..dff153c 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -381,10 +381,14 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
auto Top = R.Begin;
for (const auto &I : Schedule) {
auto MI = getMachineInstr(I);
- if (MI != &*Top) {
+
+ MachineBasicBlock::iterator MII = MI->getIterator();
+ if (MII != Top) {
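+      // Reordering matters to LiveIntervals only when a non-debug instruction
+      // moves past another non-debug instruction; a move across nothing but
+      // debug instructions does not change liveness.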
+ bool NonDebugReordered =
+ !MI->isDebugInstr() && skipDebugInstructionsForward(Top, MII) != MII;
BB->remove(MI);
BB->insert(Top, MI);
- if (!MI->isDebugInstr())
+ if (NonDebugReordered)
LIS->handleMove(*MI, true);
}
if (!MI->isDebugInstr()) {
diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
index 959ce69..5529808 100644
--- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -43,7 +43,7 @@ public:
bool run(MachineFunction &MF);
private:
- using NSA_Status = enum {
+ enum NSA_Status {
NOT_NSA, // Not an NSA instruction
FIXED, // NSA which we cannot modify
NON_CONTIGUOUS, // NSA with non-sequential address which we can try
@@ -81,9 +81,7 @@ class GCNNSAReassignLegacy : public MachineFunctionPass {
public:
static char ID;
- GCNNSAReassignLegacy() : MachineFunctionPass(ID) {
- initializeGCNNSAReassignLegacyPass(*PassRegistry::getPassRegistry());
- }
+ GCNNSAReassignLegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp b/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp
index 355bbeb..5e9ac56 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp
@@ -57,9 +57,7 @@ public:
class GCNPreRALongBranchRegLegacy : public MachineFunctionPass {
public:
static char ID;
- GCNPreRALongBranchRegLegacy() : MachineFunctionPass(ID) {
- initializeGCNPreRALongBranchRegLegacyPass(*PassRegistry::getPassRegistry());
- }
+ GCNPreRALongBranchRegLegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
return GCNPreRALongBranchReg().run(MF);
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4deb2a9..cd56887 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -63,9 +63,7 @@ class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
public:
static char ID;
- GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) {
- initializeGCNPreRAOptimizationsLegacyPass(*PassRegistry::getPassRegistry());
- }
+ GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -136,7 +134,7 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
continue;
if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
- MachineOperand DefSrcMO = Def.getOperand(1);
+ const MachineOperand &DefSrcMO = Def.getOperand(1);
// Immediates are not an issue and can be propagated in
// postrapseudos pass. Only handle cases where defining
@@ -270,15 +268,14 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
continue;
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
- if (Dst.isVirtual() &&
- MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
- Src.isPhysical() &&
+ const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, Dst);
+ bool IsDst16Bit = AMDGPU::VGPR_16RegClass.hasSubClassEq(DstRC);
+ if (Dst.isVirtual() && IsDst16Bit && Src.isPhysical() &&
TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
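+      // Hinting the virtual 16-bit register at the low half of the physical
+      // 32-bit register (and vice versa below) lets the allocator fold these
+      // cross-class copies into plain subregister assignments.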
if (Src.isVirtual() &&
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
- Dst.isPhysical() &&
- TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
+ Dst.isPhysical() && DstRC == &AMDGPU::VGPR_32RegClass)
MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
if (!Dst.isVirtual() || !Src.isVirtual())
continue;
@@ -287,8 +284,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
}
- if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
- MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
+ if (IsDst16Bit && MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
}
}
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index c6fb31f..9949208 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -304,6 +304,10 @@ def : ProcessorModel<"gfx1153", GFX11SpeedModel,
FeatureISAVersion11_5_3.Features
>;
+def : ProcessorModel<"gfx1170", GFX11SpeedModel,
+ FeatureISAVersion11_7_0.Features
+>;
+
// [gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx1153]
def : ProcessorModel<"gfx11-generic", GFX11SpeedModel,
FeatureISAVersion11_Generic.Features
@@ -333,3 +337,11 @@ def : ProcessorModel<"gfx1250", GFX1250SpeedModel,
def : ProcessorModel<"gfx1251", GFX1250SpeedModel,
FeatureISAVersion12_51.Features
>;
+
+//===----------------------------------------------------------------------===//
+// GCN GFX13.
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"gfx1310", GFX12SpeedModel,
+ FeatureISAVersion13.Features
+>;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 4e11c4f..89307ef 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -97,6 +97,51 @@ void GCNRegPressure::inc(unsigned Reg,
Value[RegKind] += Sign;
}
+namespace {
+struct RegExcess {
+ unsigned SGPR = 0;
+ unsigned VGPR = 0;
+ unsigned ArchVGPR = 0;
+ unsigned AGPR = 0;
+
+ bool anyExcess() const { return SGPR || VGPR || ArchVGPR || AGPR; }
+ bool hasVectorRegisterExcess() const { return VGPR || ArchVGPR || AGPR; }
+
+ RegExcess(const MachineFunction &MF, const GCNRegPressure &RP)
+ : RegExcess(MF, RP, GCNRPTarget(MF, RP)) {}
+ RegExcess(const MachineFunction &MF, const GCNRegPressure &RP,
+ const GCNRPTarget &Target) {
+ unsigned MaxSGPRs = Target.getMaxSGPRs();
+ unsigned MaxVGPRs = Target.getMaxVGPRs();
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ SGPR = std::max(static_cast<int>(RP.getSGPRNum() - MaxSGPRs), 0);
+
+    // The number of virtual VGPRs required to handle the excess SGPRs.
+ unsigned WaveSize = ST.getWavefrontSize();
+ unsigned VGPRForSGPRSpills = divideCeil(SGPR, WaveSize);
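+    // Worked example (hypothetical numbers): with MaxSGPRs == 104,
+    // RP.getSGPRNum() == 110 and a wave64 target, SGPR == 6 and
+    // VGPRForSGPRSpills == divideCeil(6, 64) == 1.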
+
+ unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
+
+ // Unified excess pressure conditions, accounting for VGPRs used for SGPR
+    // spills.
+ VGPR = std::max(static_cast<int>(RP.getVGPRNum(ST.hasGFX90AInsts()) +
+ VGPRForSGPRSpills - MaxVGPRs),
+ 0);
+
+ unsigned ArchVGPRLimit = ST.hasGFX90AInsts() ? MaxArchVGPRs : MaxVGPRs;
+ // Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR
+    // spills.
+ ArchVGPR = std::max(static_cast<int>(RP.getArchVGPRNum() +
+ VGPRForSGPRSpills - ArchVGPRLimit),
+ 0);
+
+    // AGPR excess pressure conditions.
+ AGPR = std::max(static_cast<int>(RP.getAGPRNum() - ArchVGPRLimit), 0);
+ }
+};
+} // namespace
+
bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
unsigned MaxOccupancy) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -123,63 +168,25 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
return Occ > OtherOcc;
unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
- unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
- // SGPR excess pressure conditions
- unsigned ExcessSGPR = std::max(static_cast<int>(getSGPRNum() - MaxSGPRs), 0);
- unsigned OtherExcessSGPR =
- std::max(static_cast<int>(O.getSGPRNum() - MaxSGPRs), 0);
-
- auto WaveSize = ST.getWavefrontSize();
- // The number of virtual VGPRs required to handle excess SGPR
- unsigned VGPRForSGPRSpills = (ExcessSGPR + (WaveSize - 1)) / WaveSize;
- unsigned OtherVGPRForSGPRSpills =
- (OtherExcessSGPR + (WaveSize - 1)) / WaveSize;
+ RegExcess Excess(MF, *this);
+ RegExcess OtherExcess(MF, O);
unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
- // Unified excess pressure conditions, accounting for VGPRs used for SGPR
- // spills
- unsigned ExcessVGPR =
- std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts()) +
- VGPRForSGPRSpills - MaxVGPRs),
- 0);
- unsigned OtherExcessVGPR =
- std::max(static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts()) +
- OtherVGPRForSGPRSpills - MaxVGPRs),
- 0);
- // Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR
- // spills
- unsigned ExcessArchVGPR = std::max(
- static_cast<int>(getVGPRNum(false) + VGPRForSGPRSpills - MaxArchVGPRs),
- 0);
- unsigned OtherExcessArchVGPR =
- std::max(static_cast<int>(O.getVGPRNum(false) + OtherVGPRForSGPRSpills -
- MaxArchVGPRs),
- 0);
- // AGPR excess pressure conditions
- unsigned ExcessAGPR = std::max(
- static_cast<int>(ST.hasGFX90AInsts() ? (getAGPRNum() - MaxArchVGPRs)
- : (getAGPRNum() - MaxVGPRs)),
- 0);
- unsigned OtherExcessAGPR = std::max(
- static_cast<int>(ST.hasGFX90AInsts() ? (O.getAGPRNum() - MaxArchVGPRs)
- : (O.getAGPRNum() - MaxVGPRs)),
- 0);
-
- bool ExcessRP = ExcessSGPR || ExcessVGPR || ExcessArchVGPR || ExcessAGPR;
- bool OtherExcessRP = OtherExcessSGPR || OtherExcessVGPR ||
- OtherExcessArchVGPR || OtherExcessAGPR;
+ bool ExcessRP = Excess.anyExcess();
+ bool OtherExcessRP = OtherExcess.anyExcess();
// Give second precedence to the reduced number of spills to hold the register
// pressure.
if (ExcessRP || OtherExcessRP) {
// The difference in excess VGPR pressure, after including VGPRs used for
// SGPR spills
- int VGPRDiff = ((OtherExcessVGPR + OtherExcessArchVGPR + OtherExcessAGPR) -
- (ExcessVGPR + ExcessArchVGPR + ExcessAGPR));
+ int VGPRDiff =
+ ((OtherExcess.VGPR + OtherExcess.ArchVGPR + OtherExcess.AGPR) -
+ (Excess.VGPR + Excess.ArchVGPR + Excess.AGPR));
- int SGPRDiff = OtherExcessSGPR - ExcessSGPR;
+ int SGPRDiff = OtherExcess.SGPR - Excess.SGPR;
if (VGPRDiff != 0)
return VGPRDiff > 0;
@@ -282,11 +289,12 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits,
Register Reg = MO.getReg();
auto I = llvm::find_if(VRegMaskOrUnits, [Reg](const VRegMaskOrUnit &RM) {
- return RM.RegUnit == Reg;
+ return RM.VRegOrUnit.asVirtualReg() == Reg;
});
auto &P = I == VRegMaskOrUnits.end()
- ? VRegMaskOrUnits.emplace_back(Reg, LaneBitmask::getNone())
+ ? VRegMaskOrUnits.emplace_back(VirtRegOrUnit(Reg),
+ LaneBitmask::getNone())
: *I;
P.LaneMask |= MO.getSubReg() ? TRI.getSubRegIndexLaneMask(MO.getSubReg())
@@ -295,7 +303,7 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits,
SlotIndex InstrSI;
for (auto &P : VRegMaskOrUnits) {
- auto &LI = LIS.getInterval(P.RegUnit);
+ auto &LI = LIS.getInterval(P.VRegOrUnit.asVirtualReg());
if (!LI.hasSubRanges())
continue;
@@ -312,29 +320,22 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits,
/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
static LaneBitmask getLanesWithProperty(
const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
- bool TrackLaneMasks, Register RegUnit, SlotIndex Pos,
- LaneBitmask SafeDefault,
+ bool TrackLaneMasks, Register Reg, SlotIndex Pos,
function_ref<bool(const LiveRange &LR, SlotIndex Pos)> Property) {
- if (RegUnit.isVirtual()) {
- const LiveInterval &LI = LIS.getInterval(RegUnit);
- LaneBitmask Result;
- if (TrackLaneMasks && LI.hasSubRanges()) {
- for (const LiveInterval::SubRange &SR : LI.subranges()) {
- if (Property(SR, Pos))
- Result |= SR.LaneMask;
- }
- } else if (Property(LI, Pos)) {
- Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit)
- : LaneBitmask::getAll();
+ assert(Reg.isVirtual());
+ const LiveInterval &LI = LIS.getInterval(Reg);
+ LaneBitmask Result;
+ if (TrackLaneMasks && LI.hasSubRanges()) {
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if (Property(SR, Pos))
+ Result |= SR.LaneMask;
}
-
- return Result;
+ } else if (Property(LI, Pos)) {
+ Result =
+ TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(Reg) : LaneBitmask::getAll();
}
- const LiveRange *LR = LIS.getCachedRegUnit(RegUnit);
- if (LR == nullptr)
- return SafeDefault;
- return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone();
+ return Result;
}
/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
@@ -412,15 +413,15 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg) const {
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
+ RegExcess Excess(MF, RP, *this);
+
if (SRI->isSGPRClass(RC))
- return RP.getSGPRNum() > MaxSGPRs;
- unsigned NumVGPRs =
- SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum();
- // The addressable limit must always be respected.
- if (NumVGPRs > MaxVGPRs)
- return true;
- // For unified RFs, combined VGPR usage limit must be respected as well.
- return UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs;
+ return Excess.SGPR;
+
+ if (SRI->isAGPRClass(RC))
+ return (UnifiedRF && Excess.VGPR) || Excess.AGPR;
+
+ return (UnifiedRF && Excess.VGPR) || Excess.ArchVGPR;
}
bool GCNRPTarget::satisfied() const {
@@ -431,6 +432,11 @@ bool GCNRPTarget::satisfied() const {
return true;
}
+bool GCNRPTarget::hasVectorRegisterExcess() const {
+ RegExcess Excess(MF, RP, *this);
+ return Excess.hasVectorRegisterExcess();
+}
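
RegExcess itself is introduced by an earlier hunk of this patch that is not shown in this section. A minimal sketch of the shape the two functions above assume, with field semantics inferred from usage only (how the real struct is populated from the function, pressure, and target limits is an assumption here):

struct RegExcess {
  // Hypothetical sketch: each field holds the pressure above the
  // corresponding limit (0 when the limit is respected), so it sums as a
  // count and tests as a bool, matching both uses above. In the patch,
  // RegExcess(MF, RP, Target) fills these from the current pressure.
  unsigned SGPR = 0;     // SGPR pressure above the SGPR limit
  unsigned VGPR = 0;     // combined pressure above the unified-file limit
  unsigned ArchVGPR = 0; // ArchVGPR pressure above the per-class limit
  unsigned AGPR = 0;     // AGPR pressure above the per-class limit
  bool hasVectorRegisterExcess() const { return VGPR || ArchVGPR || AGPR; }
};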
+
///////////////////////////////////////////////////////////////////////////////
// GCNRPTracker
@@ -502,10 +508,9 @@ void GCNRPTracker::reset(const MachineRegisterInfo &MRI_,
}
/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
-LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit,
- SlotIndex Pos) const {
+LaneBitmask GCNRPTracker::getLastUsedLanes(Register Reg, SlotIndex Pos) const {
return getLanesWithProperty(
- LIS, *MRI, true, RegUnit, Pos.getBaseIndex(), LaneBitmask::getNone(),
+ LIS, *MRI, true, Reg, Pos.getBaseIndex(),
[](const LiveRange &LR, SlotIndex Pos) {
const LiveRange::Segment *S = LR.getSegmentContaining(Pos);
return S != nullptr && S->end == Pos.getRegSlot();
@@ -562,10 +567,10 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
SmallVector<VRegMaskOrUnit, 8> RegUses;
collectVirtualRegUses(RegUses, MI, LIS, *MRI);
for (const VRegMaskOrUnit &U : RegUses) {
- LaneBitmask &LiveMask = LiveRegs[U.RegUnit];
+ LaneBitmask &LiveMask = LiveRegs[U.VRegOrUnit.asVirtualReg()];
LaneBitmask PrevMask = LiveMask;
LiveMask |= U.LaneMask;
- CurPressure.inc(U.RegUnit, PrevMask, LiveMask, *MRI);
+ CurPressure.inc(U.VRegOrUnit.asVirtualReg(), PrevMask, LiveMask, *MRI);
}
// Update MaxPressure with uses plus early-clobber defs pressure.
@@ -580,7 +585,7 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
const LiveRegSet *LiveRegsCopy) {
- MRI = &MI.getParent()->getParent()->getRegInfo();
+ MRI = &MI.getMF()->getRegInfo();
LastTrackedMI = nullptr;
MBBEnd = MI.getParent()->end();
NextMI = &MI;
@@ -748,9 +753,9 @@ GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
GCNRegPressure TempPressure = CurPressure;
for (const VRegMaskOrUnit &Use : RegOpers.Uses) {
- Register Reg = Use.RegUnit;
- if (!Reg.isVirtual())
+ if (!Use.VRegOrUnit.isVirtualReg())
continue;
+ Register Reg = Use.VRegOrUnit.asVirtualReg();
LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx);
if (LastUseMask.none())
continue;
@@ -782,9 +787,9 @@ GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
// Generate liveness for defs.
for (const VRegMaskOrUnit &Def : RegOpers.Defs) {
- Register Reg = Def.RegUnit;
- if (!Reg.isVirtual())
+ if (!Def.VRegOrUnit.isVirtualReg())
continue;
+ Register Reg = Def.VRegOrUnit.asVirtualReg();
auto It = LiveRegs.find(Reg);
LaneBitmask LiveMask = It != LiveRegs.end() ? It->second : LaneBitmask(0);
LaneBitmask NewMask = LiveMask | Def.LaneMask;
@@ -824,8 +829,7 @@ Printable llvm::print(const GCNRPTracker::LiveRegSet &LiveRegs,
Register Reg = Register::index2VirtReg(I);
auto It = LiveRegs.find(Reg);
if (It != LiveRegs.end() && It->second.any())
- OS << ' ' << printVRegOrUnit(Reg, TRI) << ':'
- << PrintLaneMask(It->second);
+ OS << ' ' << printReg(Reg, TRI) << ':' << PrintLaneMask(It->second);
}
OS << '\n';
});
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 979a8b0..c55796c 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include <algorithm>
+#include <array>
namespace llvm {
@@ -45,7 +46,7 @@ struct GCNRegPressure {
return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR];
}
- void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
+ void clear() { Value.fill(0); }
unsigned getNumRegs(RegKind Kind) const {
assert(Kind < TOTAL_KINDS);
@@ -101,6 +102,29 @@ struct GCNRegPressure {
DynamicVGPRBlockSize));
}
+ unsigned getVGPRSpills(MachineFunction &MF, unsigned ArchVGPRThreshold,
+ unsigned AGPRThreshold, unsigned CombinedThreshold) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasGFX90AInsts())
+ return 0;
+
+ unsigned ArchPressure = getArchVGPRNum();
+ unsigned AGPRPressure = getAGPRNum();
+
+ unsigned ArchSpill = ArchPressure > ArchVGPRThreshold
+ ? (ArchPressure - ArchVGPRThreshold)
+ : 0;
+ unsigned AGPRSpill =
+ AGPRPressure > AGPRThreshold ? (AGPRPressure - AGPRThreshold) : 0;
+
+ unsigned UnifiedPressure = getVGPRNum(/*UnifiedVGPRFile=*/true);
+ unsigned UnifiedSpill = UnifiedPressure > CombinedThreshold
+ ? (UnifiedPressure - CombinedThreshold)
+ : 0;
+
+ return std::max(UnifiedSpill, ArchSpill + AGPRSpill);
+ }
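
A standalone model of the spill estimate above, with hypothetical register counts (plain C++, not the LLVM API; the unified-file usage is simplified to Arch + AGPR, whereas getVGPRNum(true) also accounts for the AGPR allocation granule):

#include <algorithm>
#include <cstdio>

// Per-class spills are summed, the unified-file spill is computed
// separately, and the estimate is the worse of the two, as in getVGPRSpills.
static unsigned vgprSpills(unsigned Arch, unsigned Agpr, unsigned ArchMax,
                           unsigned AgprMax, unsigned CombinedMax) {
  unsigned ArchSpill = Arch > ArchMax ? Arch - ArchMax : 0;
  unsigned AgprSpill = Agpr > AgprMax ? Agpr - AgprMax : 0;
  unsigned Unified = Arch + Agpr; // simplification, see note above
  unsigned UnifiedSpill = Unified > CombinedMax ? Unified - CombinedMax : 0;
  return std::max(UnifiedSpill, ArchSpill + AgprSpill);
}

int main() {
  // 300 ArchVGPRs against a 256 limit spills 44; 40 AGPRs against 256
  // spills 0; 340 combined against 512 spills 0; max(0, 44 + 0) = 44.
  std::printf("%u\n", vgprSpills(300, 40, 256, 256, 512)); // prints 44
}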
+
void inc(unsigned Reg,
LaneBitmask PrevMask,
LaneBitmask NewMask,
@@ -127,9 +151,7 @@ struct GCNRegPressure {
bool less(const MachineFunction &MF, const GCNRegPressure &O,
unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
- bool operator==(const GCNRegPressure &O) const {
- return std::equal(&Value[0], &Value[ValueArraySize], O.Value);
- }
+ bool operator==(const GCNRegPressure &O) const { return Value == O.Value; }
bool operator!=(const GCNRegPressure &O) const {
return !(*this == O);
@@ -160,7 +182,7 @@ private:
/// Pressure for all register kinds (first all regular registers kinds, then
/// all tuple register kinds).
- unsigned Value[ValueArraySize];
+ std::array<unsigned, ValueArraySize> Value;
static unsigned getRegKind(const TargetRegisterClass *RC,
const SIRegisterInfo *STI);
@@ -235,6 +257,12 @@ public:
/// Whether the current RP is at or below the defined pressure target.
bool satisfied() const;
+ bool hasVectorRegisterExcess() const;
+
+ unsigned getMaxSGPRs() const { return MaxSGPRs; }
+ unsigned getMaxVGPRs() const {
+ return UnifiedRF ? MaxUnifiedVGPRs : MaxVGPRs;
+ }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
friend raw_ostream &operator<<(raw_ostream &OS, const GCNRPTarget &Target) {
@@ -293,7 +321,7 @@ protected:
/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
void bumpDeadDefs(ArrayRef<VRegMaskOrUnit> DeadDefs);
- LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const;
+ LaneBitmask getLastUsedLanes(Register Reg, SlotIndex Pos) const;
public:
// reset tracker and set live register set to the specified value.
@@ -456,7 +484,7 @@ template <typename Range>
DenseMap<MachineInstr*, GCNRPTracker::LiveRegSet>
getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) {
std::vector<SlotIndex> Indexes;
- Indexes.reserve(std::distance(R.begin(), R.end()));
+ Indexes.reserve(llvm::size(R));
auto &SII = *LIS.getSlotIndexes();
for (MachineInstr *I : R) {
auto SI = SII.getInstructionIndex(*I);
@@ -464,7 +492,7 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) {
}
llvm::sort(Indexes);
- auto &MRI = (*R.begin())->getParent()->getParent()->getRegInfo();
+ auto &MRI = (*R.begin())->getMF()->getRegInfo();
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap;
SmallVector<SlotIndex, 32> LiveIdxs, SRLiveIdxs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
@@ -494,13 +522,13 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) {
inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI,
const LiveIntervals &LIS) {
return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS,
- MI.getParent()->getParent()->getRegInfo());
+ MI.getMF()->getRegInfo());
}
inline GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI,
const LiveIntervals &LIS) {
return getLiveRegs(LIS.getInstructionIndex(MI).getBaseIndex(), LIS,
- MI.getParent()->getParent()->getRegInfo());
+ MI.getMF()->getRegInfo());
}
template <typename Range>
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 9fbf9e5..b044195 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -28,10 +28,19 @@
#include "GCNRegPressure.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
#define DEBUG_TYPE "machine-scheduler"
@@ -90,6 +99,10 @@ static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
cl::init(false));
#endif
+static cl::opt<bool> DisableRewriteMFMAFormSchedStage(
+ "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
+ cl::desc("Disable rewrie mfma rewrite scheduling stage"), cl::init(true));
+
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
@@ -145,7 +158,6 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);
-
LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
<< ", VGPRExcessLimit = " << VGPRExcessLimit
<< ", SGPRCriticalLimit = " << SGPRCriticalLimit
@@ -690,6 +702,8 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C, bool IsLegacyScheduler)
: GCNSchedStrategy(C) {
SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
+ if (!DisableRewriteMFMAFormSchedStage)
+ SchedStages.push_back(GCNSchedStageID::RewriteMFMAForm);
SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
@@ -946,6 +960,8 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
switch (SchedStageID) {
case GCNSchedStageID::OccInitialSchedule:
return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
+ case GCNSchedStageID::RewriteMFMAForm:
+ return std::make_unique<RewriteMFMAFormStage>(SchedStageID, *this);
case GCNSchedStageID::UnclusteredHighRPReschedule:
return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
case GCNSchedStageID::ClusteredLowOccupancyReschedule:
@@ -970,6 +986,8 @@ void GCNScheduleDAGMILive::schedule() {
GCNRegPressure
GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
+ if (Regions[RegionIdx].first == Regions[RegionIdx].second)
+ return llvm::getRegPressure(MRI, LiveIns[RegionIdx]);
GCNDownwardRPTracker RPTracker(*LIS);
RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
&LiveIns[RegionIdx]);
@@ -978,10 +996,8 @@ GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
MachineBasicBlock::iterator RegionEnd) {
- auto REnd = RegionEnd == RegionBegin->getParent()->end()
- ? std::prev(RegionEnd)
- : RegionEnd;
- return &*skipDebugInstructionsBackward(REnd, RegionBegin);
+ assert(RegionBegin != RegionEnd && "Region must not be empty");
+ return &*skipDebugInstructionsBackward(std::prev(RegionEnd), RegionBegin);
}
void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
@@ -1076,9 +1092,12 @@ GCNScheduleDAGMILive::getRegionLiveOutMap() const {
assert(!Regions.empty());
std::vector<MachineInstr *> RegionLastMIs;
RegionLastMIs.reserve(Regions.size());
- for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
+ for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) {
+ // Skip empty regions.
+ if (RegionBegin == RegionEnd)
+ continue;
RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
-
+ }
return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
}
@@ -1088,10 +1107,12 @@ void RegionPressureMap::buildLiveRegMap() {
RegionLiveRegMap =
IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
for (unsigned I = 0; I < DAG->Regions.size(); I++) {
+ auto &[RegionBegin, RegionEnd] = DAG->Regions[I];
+ // Skip empty regions.
+ if (RegionBegin == RegionEnd)
+ continue;
MachineInstr *RegionKey =
- IsLiveOut
- ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second)
- : &*DAG->Regions[I].first;
+ IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin;
IdxToInstruction[I] = RegionKey;
}
}
@@ -1160,6 +1181,8 @@ void GCNScheduleDAGMILive::runSchedStages() {
ScheduleDAGMILive::schedule();
Stage->finalizeGCNRegion();
+ Stage->advanceRegion();
+ exitRegion();
}
Stage->finalizeGCNSchedStage();
@@ -1180,6 +1203,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
case GCNSchedStageID::OccInitialSchedule:
OS << "Max Occupancy Initial Schedule";
break;
+ case GCNSchedStageID::RewriteMFMAForm:
+ OS << "Instruction Rewriting Reschedule";
+ break;
case GCNSchedStageID::UnclusteredHighRPReschedule:
OS << "Unclustered High Register Pressure Reschedule";
break;
@@ -1213,6 +1239,107 @@ bool GCNSchedStage::initGCNSchedStage() {
return true;
}
+void RewriteMFMAFormStage::findReachingDefs(
+ MachineOperand &UseMO, LiveIntervals *LIS,
+ SmallVectorImpl<SlotIndex> &DefIdxs) {
+ MachineInstr *UseMI = UseMO.getParent();
+ LiveInterval &UseLI = LIS->getInterval(UseMO.getReg());
+ VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
+
+ // If the def is not a PHI, then it must be the only reaching def.
+ if (!VNI->isPHIDef()) {
+ DefIdxs.push_back(VNI->def);
+ return;
+ }
+
+ SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()};
+ SmallVector<MachineBasicBlock *, 8> Worklist;
+
+ // Mark the predecessor blocks for traversal
+ for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) {
+ Worklist.push_back(PredMBB);
+ Visited.insert(PredMBB);
+ }
+
+ while (!Worklist.empty()) {
+ MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
+
+ SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB);
+ VNInfo *VNI = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
+
+ MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNI->def);
+
+ // If there is a def in this block, then add it to the list. This is the
+ // reaching def of this path.
+ if (!VNI->isPHIDef()) {
+ DefIdxs.push_back(VNI->def);
+ continue;
+ }
+
+ for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) {
+ if (Visited.insert(PredMBB).second)
+ Worklist.push_back(PredMBB);
+ }
+ }
+}
+
+void RewriteMFMAFormStage::findReachingUses(
+ MachineInstr *DefMI, LiveIntervals *LIS,
+ SmallVectorImpl<MachineOperand *> &ReachingUses) {
+ SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI);
+ for (MachineOperand &UseMO :
+ DAG.MRI.use_nodbg_operands(DefMI->getOperand(0).getReg())) {
+ SmallVector<SlotIndex, 8> ReachingDefIndexes;
+ findReachingDefs(UseMO, LIS, ReachingDefIndexes);
+
+ // If we find a use that contains this DefMI in its reachingDefs, then it is
+ // a reaching use.
+ if (any_of(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) {
+ return SlotIndex::isSameInstr(RDIdx, DefIdx);
+ }))
+ ReachingUses.push_back(&UseMO);
+ }
+}
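
The two helpers above walk the CFG through LiveIntervals value numbers. A self-contained model of the predecessor walk in findReachingDefs, using plain blocks in place of MachineBasicBlocks and a boolean in place of the PHI-def test (a simplification: the real code resumes the walk from the PHI's defining block rather than from a flag):

#include <cstdio>
#include <set>
#include <vector>

struct Block {
  int Id;
  bool HasLocalDef; // stands in for "the value number here is not a PHI def"
  std::vector<Block *> Preds;
};

static std::vector<int> reachingDefs(Block &UseBlock) {
  std::vector<int> Defs;
  std::set<Block *> Visited = {&UseBlock};
  std::vector<Block *> Worklist;
  // Mark the predecessor blocks for traversal, as in findReachingDefs.
  for (Block *P : UseBlock.Preds) {
    Worklist.push_back(P);
    Visited.insert(P);
  }
  while (!Worklist.empty()) {
    Block *B = Worklist.back();
    Worklist.pop_back();
    if (B->HasLocalDef) { // the reaching def of this path; stop descending
      Defs.push_back(B->Id);
      continue;
    }
    for (Block *P : B->Preds)
      if (Visited.insert(P).second)
        Worklist.push_back(P);
  }
  return Defs;
}

int main() {
  // Diamond CFG: 0 -> {1, 2} -> 3; defs in blocks 1 and 2, use in block 3.
  Block B0{0, false, {}};
  Block B1{1, true, {&B0}};
  Block B2{2, true, {&B0}};
  Block B3{3, false, {&B1, &B2}};
  for (int Id : reachingDefs(B3))
    std::printf("reaching def in block %d\n", Id); // blocks 2 and 1
}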
+
+bool RewriteMFMAFormStage::initGCNSchedStage() {
+ // We only need to run this pass if the architecture supports AGPRs.
+  // Additionally, we don't use AGPRs at occupancy levels above 1, so there
+ // is no need for this pass in that case, either.
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
+ return false;
+
+ RegionsWithExcessArchVGPR.resize(DAG.Regions.size());
+ RegionsWithExcessArchVGPR.reset();
+ for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
+ GCNRegPressure PressureBefore = DAG.Pressure[Region];
+ if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
+ RegionsWithExcessArchVGPR[Region] = true;
+ }
+
+ if (RegionsWithExcessArchVGPR.none())
+ return false;
+
+ TII = ST.getInstrInfo();
+ SRI = ST.getRegisterInfo();
+
+ std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands;
+ DenseMap<MachineBasicBlock *, std::set<Register>> CopyForUse;
+ SmallPtrSet<MachineInstr *, 8> CopyForDef;
+
+ if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
+ return false;
+
+ int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);
+
+  // If we haven't found beneficial conditions, prefer the VGPR form, which
+  // may result in fewer cross-RC copies.
+ if (Cost > 0)
+ return false;
+
+ return rewrite(RewriteCands);
+}
+
bool UnclusteredHighRPStage::initGCNSchedStage() {
if (DisableUnclusterHighRP)
return false;
@@ -1228,18 +1355,20 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
InitialOccupancy = DAG.MinOccupancy;
- // Aggressivly try to reduce register pressure in the unclustered high RP
+ // Aggressively try to reduce register pressure in the unclustered high RP
// stage. Temporarily increase occupancy target in the region.
+ TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
+ ? InitialOccupancy + 1
+ : InitialOccupancy;
+ IsAnyRegionScheduled = false;
S.SGPRLimitBias = S.HighRPSGPRBias;
S.VGPRLimitBias = S.HighRPVGPRBias;
- if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
- MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
LLVM_DEBUG(
dbgs()
<< "Retrying function scheduling without clustering. "
- "Aggressivly try to reduce register pressure to achieve occupancy "
- << DAG.MinOccupancy << ".\n");
+ "Aggressively try to reduce register pressure to achieve occupancy "
+ << TempTargetOccupancy << ".\n");
return true;
}
@@ -1267,33 +1396,222 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
#define REMAT_PREFIX "[PreRARemat] "
#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+Printable PreRARematStage::ScoredRemat::print() const {
+ return Printable([&](raw_ostream &OS) {
+ OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
+ });
+}
+#endif
+
bool PreRARematStage::initGCNSchedStage() {
// FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
  // regions in between the defs and the region we sank the def to. Will need to
  // be
// fixed if there is another pass after this pass.
assert(!S.hasNextStage());
- if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1)
+ if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
return false;
+ // Maps all MIs (except lone terminators, which are not part of any region) to
+ // their parent region. Non-lone terminators are considered part of the region
+  // they delimit.
+ DenseMap<MachineInstr *, unsigned> MIRegion(MF.getInstructionCount());
+
// Before performing any IR modification record the parent region of each MI
// and the parent MBB of each region.
const unsigned NumRegions = DAG.Regions.size();
- RegionBB.reserve(NumRegions);
for (unsigned I = 0; I < NumRegions; ++I) {
RegionBoundaries Region = DAG.Regions[I];
for (auto MI = Region.first; MI != Region.second; ++MI)
MIRegion.insert({&*MI, I});
- RegionBB.push_back(Region.first->getParent());
+ MachineBasicBlock *ParentMBB = Region.first->getParent();
+ if (Region.second != ParentMBB->end())
+ MIRegion.insert({&*Region.second, I});
+ RegionBB.push_back(ParentMBB);
}
- if (!canIncreaseOccupancyOrReduceSpill())
+#ifndef NDEBUG
+ auto PrintTargetRegions = [&]() -> void {
+ if (TargetRegions.none()) {
+ dbgs() << REMAT_PREFIX << "No target regions\n";
+ return;
+ }
+ dbgs() << REMAT_PREFIX << "Target regions:\n";
+ for (unsigned I : TargetRegions.set_bits())
+ dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
+ };
+ auto PrintRematReg = [&](const RematReg &Remat) -> Printable {
+ return Printable([&, Remat](raw_ostream &OS) {
+ // Concatenate all region numbers in which the register is unused and
+ // live-through.
+ bool HasLiveThroughRegion = false;
+ OS << '[' << Remat.DefRegion << " -";
+ for (unsigned I = 0; I < NumRegions; ++I) {
+ if (Remat.isUnusedLiveThrough(I)) {
+ if (HasLiveThroughRegion) {
+ OS << ',';
+ } else {
+ OS << "- ";
+ HasLiveThroughRegion = true;
+ }
+ OS << I;
+ }
+ }
+ if (HasLiveThroughRegion)
+ OS << " -";
+ OS << "-> " << Remat.UseRegion << "] ";
+ Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false,
+ /*SkipDebugLoc=*/false, /*AddNewLine=*/false);
+ });
+ };
+#endif
+
+ // Set an objective for the stage based on current RP in each region.
+ REMAT_DEBUG({
+ dbgs() << "Analyzing ";
+ MF.getFunction().printAsOperand(dbgs(), false);
+ dbgs() << ": ";
+ });
+ if (!setObjective()) {
+ LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at "
+ << MFI.getMaxWavesPerEU() << '\n');
return false;
+ }
+ LLVM_DEBUG({
+ if (TargetOcc) {
+ dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
+ } else {
+ dbgs() << "reduce spilling (minimum target occupancy is "
+ << MFI.getMinWavesPerEU() << ")\n";
+ }
+ PrintTargetRegions();
+ });
+
+ if (!collectRematRegs(MIRegion)) {
+ REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
+ return false;
+ }
+ const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
+ REMAT_DEBUG({
+ dbgs() << "Rematerializable registers:\n";
+ for (const RematReg &Remat : RematRegs)
+ dbgs() << REMAT_PREFIX << " " << PrintRematReg(Remat) << '\n';
+ dbgs() << REMAT_PREFIX << "Region frequencies\n";
+ for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
+ dbgs() << REMAT_PREFIX << " [" << I << "] ";
+ if (Freq)
+ dbgs() << Freq;
+ else
+ dbgs() << "unknown ";
+ dbgs() << " | " << *DAG.Regions[I].first;
+ }
+ });
+
+ SmallVector<ScoredRemat> ScoredRemats;
+ for (RematReg &Remat : RematRegs)
+ ScoredRemats.emplace_back(&Remat, FreqInfo, DAG);
+
+// Rematerialize registers in successive rounds until all RP targets are
+// satisfied or until we run out of rematerialization candidates.
+#ifndef NDEBUG
+ unsigned RoundNum = 0;
+#endif
+ BitVector RecomputeRP(NumRegions);
+ do {
+ assert(!ScoredRemats.empty() && "no more remat candidates");
+
+ // (Re-)Score and (re-)sort all remats in increasing score order.
+ for (ScoredRemat &Remat : ScoredRemats)
+ Remat.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
+ sort(ScoredRemats);
+
+ REMAT_DEBUG({
+ dbgs() << "==== ROUND " << RoundNum++ << " ====\n"
+ << REMAT_PREFIX
+ << "Candidates with non-null score, in rematerialization order:\n";
+ for (const ScoredRemat &RematDecision : reverse(ScoredRemats)) {
+ if (RematDecision.hasNullScore())
+ break;
+ dbgs() << REMAT_PREFIX << " " << RematDecision.print() << " | "
+ << *RematDecision.Remat->DefMI;
+ }
+ PrintTargetRegions();
+ });
+
+ RecomputeRP.reset();
+ unsigned RematIdx = ScoredRemats.size();
+
+ // Rematerialize registers in decreasing score order until we estimate
+ // that all RP targets are satisfied or until rematerialization candidates
+ // are no longer useful to decrease RP.
+ for (; RematIdx && TargetRegions.any(); --RematIdx) {
+ const ScoredRemat &Candidate = ScoredRemats[RematIdx - 1];
+ // Stop rematerializing on encountering a null score. Since scores
+ // monotonically decrease as we rematerialize, we know there is nothing
+ // useful left to do in such cases, even if we were to re-score.
+ if (Candidate.hasNullScore()) {
+ RematIdx = 0;
+ break;
+ }
+
+ RematReg &Remat = *Candidate.Remat;
+ // When previous rematerializations in this round have already satisfied
+ // RP targets in all regions this rematerialization can impact, we have a
+ // good indication that our scores have diverged significantly from
+ // reality, in which case we interrupt this round and re-score. This also
+ // ensures that every rematerialization we perform is possibly impactful
+ // in at least one target region.
+ if (!Remat.maybeBeneficial(TargetRegions, RPTargets))
+ break;
+
+ REMAT_DEBUG(dbgs() << "** REMAT " << PrintRematReg(Remat) << '\n';);
+ // Every rematerialization we do here is likely to move the instruction
+      // into a higher-frequency region, increasing the aggregate latency of the
+ // instruction itself. This is acceptable if we are eliminating a spill in
+ // the process, but when the goal is increasing occupancy we get nothing
+ // out of rematerialization if occupancy is not increased in the end; in
+ // such cases we want to roll back the rematerialization.
+ RollbackInfo *Rollback =
+ TargetOcc ? &Rollbacks.emplace_back(&Remat) : nullptr;
+ rematerialize(Remat, RecomputeRP, Rollback);
+ unsetSatisifedRPTargets(Remat.Live);
+ }
+
+ REMAT_DEBUG({
+ if (!TargetRegions.any()) {
+ dbgs() << "** Interrupt round on all targets achieved\n";
+ } else if (RematIdx) {
+ dbgs() << "** Interrupt round on stale score for "
+ << *ScoredRemats[RematIdx - 1].Remat->DefMI;
+ } else {
+ dbgs() << "** Stop on exhausted rematerialization candidates\n";
+ }
+ });
+
+ // Peel off registers we already rematerialized from the vector's tail.
+ ScoredRemats.truncate(RematIdx);
+ } while ((updateAndVerifyRPTargets(RecomputeRP) || TargetRegions.any()) &&
+ !ScoredRemats.empty());
+ if (RescheduleRegions.none())
+ return false;
+
+ // Commit all pressure changes to the DAG and compute minimum achieved
+ // occupancy in impacted regions.
+ REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n");
+ unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
+ for (unsigned I : RescheduleRegions.set_bits()) {
+ DAG.Pressure[I] = RPTargets[I].getCurrentRP();
+ REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy "
+ << DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize)
+ << " (" << RPTargets[I] << ")\n");
+ }
+ AchievedOcc = MFI.getMaxWavesPerEU();
+ for (const GCNRegPressure &RP : DAG.Pressure) {
+ AchievedOcc =
+ std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
+ }
- // Rematerialize identified instructions and update scheduler's state.
- rematerialize();
- if (GCNTrackers)
- DAG.RegionLiveOuts.buildLiveRegMap();
REMAT_DEBUG({
dbgs() << "Retrying function scheduling with new min. occupancy of "
<< AchievedOcc << " from rematerializing (original was "
@@ -1303,11 +1621,7 @@ bool PreRARematStage::initGCNSchedStage() {
dbgs() << ")\n";
});
- if (AchievedOcc > DAG.MinOccupancy) {
- DAG.MinOccupancy = AchievedOcc;
- SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
- MFI.increaseOccupancy(MF, DAG.MinOccupancy);
- }
+ DAG.setTargetOccupancy(getStageTargetOccupancy());
return true;
}
@@ -1320,15 +1634,26 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
SavedMutations.swap(DAG.Mutations);
S.SGPRLimitBias = S.VGPRLimitBias = 0;
if (DAG.MinOccupancy > InitialOccupancy) {
+ assert(IsAnyRegionScheduled);
LLVM_DEBUG(dbgs() << StageID
<< " stage successfully increased occupancy to "
<< DAG.MinOccupancy << '\n');
+ } else if (!IsAnyRegionScheduled) {
+ assert(DAG.MinOccupancy == InitialOccupancy);
+ LLVM_DEBUG(dbgs() << StageID
+ << ": No regions scheduled, min occupancy stays at "
+ << DAG.MinOccupancy << ", MFI occupancy stays at "
+ << MFI.getOccupancy() << ".\n");
}
GCNSchedStage::finalizeGCNSchedStage();
}
bool GCNSchedStage::initGCNRegion() {
+ // Skip empty scheduling region.
+ if (DAG.begin() == DAG.end())
+ return false;
+
// Check whether this new region is also a new block.
if (DAG.RegionBegin->getParent() != CurrentMBB)
setupNewBlock();
@@ -1336,8 +1661,8 @@ bool GCNSchedStage::initGCNRegion() {
unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
- // Skip empty scheduling regions (0 or 1 schedulable instructions).
- if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
+ // Skip regions with 1 schedulable instruction.
+ if (DAG.begin() == std::prev(DAG.end()))
return false;
LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
@@ -1396,13 +1721,25 @@ bool UnclusteredHighRPStage::initGCNRegion() {
// rescheduling of previous regions did not make occupancy drop back down to
// the initial minimum).
unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
+ // If no region has been scheduled yet, the DAG has not yet been updated with
+ // the occupancy target. So retrieve it from the temporary.
+ unsigned CurrentTargetOccupancy =
+ IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
if (!DAG.RegionsWithExcessRP[RegionIdx] &&
- (DAG.MinOccupancy <= InitialOccupancy ||
+ (CurrentTargetOccupancy <= InitialOccupancy ||
DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
InitialOccupancy))
return false;
- return GCNSchedStage::initGCNRegion();
+ bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
+ // If this is the first region scheduled during this stage, make the target
+ // occupancy changes in the DAG and MFI.
+ if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
+ IsAnyRegionScheduled = true;
+ if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
+ DAG.setTargetOccupancy(TempTargetOccupancy);
+ }
+ return IsSchedulingThisRegion;
}
bool ClusteredLowOccStage::initGCNRegion() {
@@ -1447,9 +1784,23 @@ void GCNSchedStage::finalizeGCNRegion() {
if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
SavedMutations.swap(DAG.Mutations);
+}
- DAG.exitRegion();
- advanceRegion();
+void PreRARematStage::finalizeGCNRegion() {
+ GCNSchedStage::finalizeGCNRegion();
+ // When the goal is to increase occupancy, all regions must reach the target
+  // occupancy for rematerializations to be useful at all; otherwise we will
+ // just hurt latency for no benefit. If minimum occupancy drops below the
+ // target there is no point in trying to re-schedule further regions.
+ if (!TargetOcc)
+ return;
+ RegionReverts.emplace_back(RegionIdx, Unsched, PressureBefore);
+ if (DAG.MinOccupancy < *TargetOcc) {
+ REMAT_DEBUG(dbgs() << "Region " << RegionIdx
+ << " cannot meet occupancy target, interrupting "
+ "re-scheduling in all regions\n");
+ RescheduleRegions.reset();
+ }
}
void GCNSchedStage::checkScheduling() {
@@ -1518,10 +1869,12 @@ void GCNSchedStage::checkScheduling() {
// Revert if this region's schedule would cause a drop in occupancy or
// spilling.
- if (shouldRevertScheduling(WavesAfter))
- revertScheduling();
- else
+ if (shouldRevertScheduling(WavesAfter)) {
+ modifyRegionSchedule(RegionIdx, DAG.BB, Unsched);
+ std::tie(DAG.RegionBegin, DAG.RegionEnd) = DAG.Regions[RegionIdx];
+ } else {
DAG.Pressure[RegionIdx] = PressureAfter;
+ }
}
unsigned
@@ -1723,8 +2076,9 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
}
bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
- return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
- mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc);
+  // When trying to increase occupancy (TargetOcc is set), the stage manages
+ // region reverts globally (all or none), so we always return false here.
+ return !TargetOcc && mayCauseSpilling(WavesAfter);
}
bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
@@ -1749,89 +2103,625 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
return false;
}
-void GCNSchedStage::revertScheduling() {
- LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
- DAG.RegionEnd = DAG.RegionBegin;
- int SkippedDebugInstr = 0;
- for (MachineInstr *MI : Unsched) {
- if (MI->isDebugInstr()) {
- ++SkippedDebugInstr;
- continue;
- }
+void GCNSchedStage::modifyRegionSchedule(unsigned RegionIdx,
+ MachineBasicBlock *MBB,
+ ArrayRef<MachineInstr *> MIOrder) {
+ assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first,
+ DAG.Regions[RegionIdx].second)) ==
+ MIOrder.size() &&
+ "instruction number mismatch");
+ if (MIOrder.empty())
+ return;
- if (MI->getIterator() != DAG.RegionEnd) {
- DAG.BB->splice(DAG.RegionEnd, DAG.BB, MI);
- if (!MI->isDebugInstr())
+ LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n');
+
+  // Reconstruct the MI sequence by moving instructions in the desired order
+  // before the current region's start.
+ MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first;
+ for (MachineInstr *MI : MIOrder) {
+ // Either move the next MI in order before the end of the region or move the
+ // region end past the MI if it is at the correct position.
+ MachineBasicBlock::iterator MII = MI->getIterator();
+ if (MII != RegionEnd) {
+ // Will subsequent splice move MI up past a non-debug instruction?
+ bool NonDebugReordered =
+ !MI->isDebugInstr() &&
+ skipDebugInstructionsForward(RegionEnd, MII) != MII;
+ MBB->splice(RegionEnd, MBB, MI);
+ // Only update LiveIntervals information if non-debug instructions are
+ // reordered. Otherwise debug instructions could cause code generation to
+ // change.
+ if (NonDebugReordered)
DAG.LIS->handleMove(*MI, true);
+ } else {
+ ++RegionEnd;
+ }
+ if (MI->isDebugInstr()) {
+ LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
+ continue;
}
// Reset read-undef flags and update them later.
- for (auto &Op : MI->all_defs())
+ for (MachineOperand &Op : MI->all_defs())
Op.setIsUndef(false);
RegisterOperands RegOpers;
RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
- if (!MI->isDebugInstr()) {
- if (DAG.ShouldTrackLaneMasks) {
- // Adjust liveness and add missing dead+read-undef flags.
- SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
- RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
- } else {
- // Adjust for missing dead-def flags.
- RegOpers.detectDeadDefs(*MI, *DAG.LIS);
- }
+ if (DAG.ShouldTrackLaneMasks) {
+ // Adjust liveness and add missing dead+read-undef flags.
+ SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
+ RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
+ } else {
+ // Adjust for missing dead-def flags.
+ RegOpers.detectDeadDefs(*MI, *DAG.LIS);
}
- DAG.RegionEnd = MI->getIterator();
- ++DAG.RegionEnd;
LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
}
- // After reverting schedule, debug instrs will now be at the end of the block
- // and RegionEnd will point to the first debug instr. Increment RegionEnd
- // pass debug instrs to the actual end of the scheduling region.
- while (SkippedDebugInstr-- > 0)
- ++DAG.RegionEnd;
+ // The region end doesn't change throughout scheduling since it itself is
+  // outside the region (whether that is an MBB end or a terminator MI).
+ assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch");
+ DAG.Regions[RegionIdx].first = MIOrder.front();
+}
+
+bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const {
- // If Unsched.front() instruction is a debug instruction, this will actually
- // shrink the region since we moved all debug instructions to the end of the
- // block. Find the first instruction that is not a debug instruction.
- DAG.RegionBegin = Unsched.front()->getIterator();
- if (DAG.RegionBegin->isDebugInstr()) {
- for (MachineInstr *MI : Unsched) {
- if (MI->isDebugInstr())
+ if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
+ return false;
+ return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
+}
+
+bool RewriteMFMAFormStage::initHeuristics(
+ std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+ DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+ SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
+ bool Changed = false;
+
+ // Prepare for the heuristics
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (!isRewriteCandidate(&MI))
continue;
- DAG.RegionBegin = MI->getIterator();
- break;
+
+ int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
+ assert(ReplacementOp != -1);
+
+ RewriteCands.push_back({&MI, MI.getOpcode()});
+ MI.setDesc(TII->get(ReplacementOp));
+
+ MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ if (Src2->isReg()) {
+ SmallVector<SlotIndex, 8> Src2ReachingDefs;
+ findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
+
+ // For any definition of the src2 register which is non-MFMA, we
+ // insert a copy.
+ for (SlotIndex RDIdx : Src2ReachingDefs) {
+ MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
+ if (!TII->isMAI(*RD))
+ CopyForDef.insert(RD);
+ }
+ }
+
+ MachineOperand &Dst = MI.getOperand(0);
+ SmallVector<MachineOperand *, 8> DstReachingUses;
+
+ findReachingUses(&MI, DAG.LIS, DstReachingUses);
+
+ for (MachineOperand *RUOp : DstReachingUses) {
+ if (TII->isMAI(*RUOp->getParent()))
+ continue;
+
+ // For any user of the result of the MFMA which is not an MFMA, we
+ // insert a copy. For a given register, we will only insert one copy
+ // per user block.
+ CopyForUse[RUOp->getParent()->getParent()].insert(RUOp->getReg());
+
+ SmallVector<SlotIndex, 8> DstUsesReachingDefs;
+ findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
+
+ for (SlotIndex RDIndex : DstUsesReachingDefs) {
+ MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
+ if (TII->isMAI(*RD))
+ continue;
+
+ // For any definition of the user of the MFMA which is not an MFMA,
+ // we insert a copy. We do this to transform all the reaching defs
+ // of this use to AGPR. By doing this, we can insert a copy from
+ // AGPR to VGPR at the user rather than after the MFMA.
+ CopyForDef.insert(RD);
+ }
+ }
+
+ // Do the rewrite to allow for updated RP calculation.
+ const TargetRegisterClass *VGPRRC = DAG.MRI.getRegClass(Dst.getReg());
+ const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC);
+ DAG.MRI.setRegClass(Dst.getReg(), AGPRRC);
+ if (Src2->isReg())
+ DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
+ Changed = true;
}
}
- // Then move the debug instructions back into their correct place and set
- // RegionBegin and RegionEnd if needed.
- DAG.placeDebugValues();
+ return Changed;
+}
- DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
+int64_t RewriteMFMAFormStage::getRewriteCost(
+ const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+ const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+ const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
+ MachineBlockFrequencyInfo *MBFI = DAG.MBFI;
+
+ int64_t BestSpillCost = 0;
+ int64_t Cost = 0;
+ uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
+
+ std::pair<unsigned, unsigned> MaxVectorRegs =
+ ST.getMaxNumVectorRegs(MF.getFunction());
+ unsigned ArchVGPRThreshold = MaxVectorRegs.first;
+ unsigned AGPRThreshold = MaxVectorRegs.second;
+ unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
+
+ for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
+ if (!RegionsWithExcessArchVGPR[Region])
+ continue;
+
+ GCNRegPressure &PressureBefore = DAG.Pressure[Region];
+ unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
+ MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
+
+ // For the cases we care about (i.e. ArchVGPR usage is greater than the
+    // addressable limit), rewriting alone should bring pressure to a
+    // manageable level. If we find any such region, the rewrite is potentially
+ // beneficial.
+ GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region);
+ unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
+ MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
+
+ uint64_t BlockFreq =
+ MBFI->getBlockFreq(DAG.Regions[Region].first->getParent())
+ .getFrequency();
+
+ bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
+ uint64_t RelativeFreq = EntryFreq && BlockFreq
+ ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
+ : BlockFreq / EntryFreq)
+ : 1;
+
+ // This assumes perfect spilling / splitting -- using one spill / copy
+    // instruction and one restoreFrom / copy for each excess register.
+ int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;
+
+ // Also account for the block frequency.
+ if (RelativeFreqIsDenom)
+ SpillCost /= (int64_t)RelativeFreq;
+ else
+ SpillCost *= (int64_t)RelativeFreq;
+
+ // If we have increased spilling in any block, just bail.
+ if (SpillCost > 0)
+ return SpillCost;
+
+ if (SpillCost < BestSpillCost)
+ BestSpillCost = SpillCost;
+ }
+
+ // Set the cost to the largest decrease in spill cost in order to not double
+ // count spill reductions.
+ Cost = BestSpillCost;
+ assert(Cost <= 0);
+
+ unsigned CopyCost = 0;
+
+ // For each CopyForDef, increase the cost by the register size while
+ // accounting for block frequency.
+ for (MachineInstr *DefMI : CopyForDef) {
+ Register DefReg = DefMI->getOperand(0).getReg();
+ uint64_t DefFreq =
+ EntryFreq
+ ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
+ : 1;
+
+ const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg);
+ CopyCost += RC->getCopyCost() * DefFreq;
+ }
+
+ // Account for CopyForUse copies in each block that the register is used.
+ for (auto &[UseBlock, UseRegs] : CopyForUse) {
+ uint64_t UseFreq =
+ EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
+
+ for (Register UseReg : UseRegs) {
+ const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg);
+ CopyCost += RC->getCopyCost() * UseFreq;
+ }
+ }
+
+ // Reset the classes that were changed to AGPR for better RB analysis.
+ // We must do rewriting after copy-insertion, as some defs of the register
+ // may require VGPR. Additionally, if we bail out and don't perform the
+ // rewrite then these need to be restored anyway.
+ for (auto &[MI, OriginalOpcode] : RewriteCands) {
+ assert(TII->isMAI(*MI));
+ const TargetRegisterClass *AGPRRC =
+ DAG.MRI.getRegClass(MI->getOperand(0).getReg());
+ const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
+
+ MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+ assert(Src2);
+
+ if (Src2->isReg())
+ DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
+ DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC);
+ MI->setDesc(TII->get(OriginalOpcode));
+ }
+
+ return Cost + CopyCost;
}
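
A standalone model of the frequency scaling in getRewriteCost, with hypothetical numbers (plain C++, not the LLVM API): the spill delta is doubled (one spill plus one restore per excess register), then multiplied by the block's frequency relative to entry, or divided when the block is colder than entry.

#include <cstdint>
#include <cstdio>

static int64_t scaledSpillCost(int64_t SpillsBefore, int64_t SpillsAfter,
                               uint64_t EntryFreq, uint64_t BlockFreq) {
  // One spill plus one restore per excess register, as assumed above.
  int64_t Cost = (SpillsAfter - SpillsBefore) * 2;
  if (!EntryFreq || !BlockFreq)
    return Cost;
  // Scale by frequency relative to entry, dividing for colder blocks.
  return EntryFreq > BlockFreq ? Cost / (int64_t)(EntryFreq / BlockFreq)
                               : Cost * (int64_t)(BlockFreq / EntryFreq);
}

int main() {
  // Rewriting removes 44 spills from a loop body about 8x hotter than the
  // entry block: (0 - 44) * 2 * 8 = -704, a strongly negative (good) cost.
  std::printf("%lld\n", (long long)scaledSpillCost(44, 0, 100, 800));
}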
-bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
+bool RewriteMFMAFormStage::rewrite(
+ const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
+ DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
+ DenseMap<MachineInstr *, unsigned> LastMIToRegion;
+
+ for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
+ RegionBoundaries Entry = DAG.Regions[Region];
+ if (Entry.first == Entry.second)
+ continue;
+
+ FirstMIToRegion[&*Entry.first] = Region;
+ if (Entry.second != Entry.first->getParent()->end())
+ LastMIToRegion[&*Entry.second] = Region;
+ }
+
+ // Rewrite the MFMAs to AGPR, and insert any copies as needed.
+ // The general assumption of the algorithm (and the previous cost calculation)
+ // is that it is better to insert the copies in the MBB of the def of the src2
+ // operands, and in the MBB of the user of the dest operands. This is based on
+ // the assumption that the MFMAs are likely to appear in loop bodies, while
+ // the src2 and dest operands are live-in / live-out of the loop. Due to this
+ // design, the algorithm for finding copy insertion points is more
+ // complicated.
+ //
+ // There are three main cases to handle: 1. the reaching defs of the src2
+ // operands, 2. the reaching uses of the dst operands, and 3. the reaching
+ // defs of the reaching uses of the dst operand.
+ //
+ // In the first case, we simply insert copies after each of the reaching
+ // definitions. In the second case, we collect all the uses of a given dest
+ // and organize them by MBB. Then, we insert 1 copy for each MBB before the
+ // earliest use. Since the use may have multiple reaching defs, and since we
+ // want to replace the register it is using with the result of the copy, we
+ // must handle case 3. In the third case, we simply insert a copy after each
+ // of the reaching defs to connect to the copy of the reaching uses of the dst
+ // reg. This allows us to avoid inserting copies next to the MFMAs.
+ //
+ // While inserting the copies, we maintain a map of operands which will use
+ // different regs (i.e. the result of the copies). For example, a case 1 src2
+ // operand will use the register result of the copies after the reaching defs,
+ // as opposed to the original register. Now that we have completed our copy
+ // analysis and placement, we can bulk update the registers. We do this
+ // separately as to avoid complicating the reachingDef and reachingUse
+ // queries.
+ //
+  // While inserting the copies, we also maintain a list of registers which we
+ // will want to reclassify as AGPR. After doing the copy insertion and the
+ // register replacement, we can finally do the reclassification. This uses the
+ // redef map, as the registers we are interested in reclassifying may be
+ // replaced by the result of a copy. We must do this after the copy analysis
+ // and placement as we must have an accurate redef map -- otherwise we may end
+ // up creating illegal instructions.
+
+ // The original registers of the MFMA that need to be reclassified as AGPR.
+ DenseSet<Register> RewriteRegs;
+ // The map of an original register in the MFMA to a new register (result of a
+ // copy) that it should be replaced with.
+ DenseMap<Register, Register> RedefMap;
+ // The map of the original MFMA registers to the relevant MFMA operands.
+ DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap;
+ // The map of reaching defs for a given register -- to avoid duplicate copies.
+ DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
+ // The map of reaching uses for a given register by basic block -- to avoid
+ // duplicate copies and to calculate per MBB insert pts.
+ DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
+ ReachingUseTracker;
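
A tiny standalone model of the deferred bulk update these maps implement (plain C++ with hypothetical vreg numbers): operands that must switch to a copy's result are only recorded during analysis, then rewritten in a single pass once every copy is placed.

#include <cstdio>
#include <map>
#include <vector>

struct Operand {
  int Reg; // stands in for a MachineOperand's register
};

int main() {
  Operand Src2{5}, Dst{5}; // two MFMA operands using original vreg 5
  std::map<int, int> RedefMap = {{5, 9}}; // original vreg 5 -> copy result 9
  std::map<int, std::vector<Operand *>> ReplaceMap = {{5, {&Src2, &Dst}}};
  // Bulk update: rewrite every recorded operand of each remapped register.
  for (auto &[OldReg, NewReg] : RedefMap)
    for (Operand *Op : ReplaceMap[OldReg])
      Op->Reg = NewReg;
  std::printf("%d %d\n", Src2.Reg, Dst.Reg); // prints "9 9"
}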
+
+ for (auto &[MI, OriginalOpcode] : RewriteCands) {
+ int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode());
+ if (ReplacementOp == -1)
+ continue;
+ MI->setDesc(TII->get(ReplacementOp));
+
+ // Case 1: insert copies for the reaching defs of the Src2Reg.
+ MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+ if (Src2->isReg()) {
+ Register Src2Reg = Src2->getReg();
+ if (!Src2Reg.isVirtual())
+ return false;
+
+ Register MappedReg = Src2->getReg();
+ SmallVector<SlotIndex, 8> Src2ReachingDefs;
+ findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
+ SmallSetVector<MachineInstr *, 8> Src2DefsReplace;
+
+ for (SlotIndex RDIndex : Src2ReachingDefs) {
+ MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
+ if (TII->isMAI(*RD))
+ continue;
+
+        // If there is a non-MAI reaching def, then we need a copy.
+ Src2DefsReplace.insert(RD);
+ }
+
+ if (!Src2DefsReplace.empty()) {
+ DenseMap<Register, Register>::iterator RI = RedefMap.find(Src2Reg);
+ if (RI != RedefMap.end()) {
+ MappedReg = RI->second;
+ } else {
+ assert(!ReachingDefCopyMap.contains(Src2Reg));
+ const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg);
+ const TargetRegisterClass *VGPRRC =
+ SRI->getEquivalentVGPRClass(Src2RC);
+
+ // Track the mapping of the original register to the new register.
+ MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
+ RedefMap[Src2Reg] = MappedReg;
+ }
+
+ // If none exists, create a copy from this reaching def.
+ // We may have inserted a copy already in an earlier iteration.
+ for (MachineInstr *RD : Src2DefsReplace) {
+ // Do not create redundant copies.
+ if (ReachingDefCopyMap[Src2Reg].insert(RD).second) {
+ MachineInstrBuilder VGPRCopy =
+ BuildMI(*RD->getParent(), std::next(RD->getIterator()),
+ RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
+ .addDef(MappedReg, {}, 0)
+ .addUse(Src2Reg, {}, 0);
+ DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
+
+ // If this reaching def was the last MI in the region, update the
+ // region boundaries.
+ if (LastMIToRegion.contains(RD)) {
+ unsigned UpdateRegion = LastMIToRegion[RD];
+ DAG.Regions[UpdateRegion].second = VGPRCopy;
+ LastMIToRegion.erase(RD);
+ }
+ }
+ }
+ }
+
+ // Track the register for reclassification
+ RewriteRegs.insert(Src2Reg);
+
+ // Always insert the operand for replacement. If this corresponds with a
+ // chain of tied-def we may not see the VGPR requirement until later.
+ ReplaceMap[Src2Reg].insert(Src2);
+ }
+
+ // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
+ // and after the reaching defs of the reaching uses of the dsts.
+
+ MachineOperand *Dst = &MI->getOperand(0);
+ Register DstReg = Dst->getReg();
+ if (!DstReg.isVirtual())
+ return false;
+
+ Register MappedReg = DstReg;
+ SmallVector<MachineOperand *, 8> DstReachingUses;
+
+ SmallVector<MachineOperand *, 8> DstReachingUseCopies;
+ SmallVector<MachineInstr *, 8> DstUseDefsReplace;
+
+ findReachingUses(MI, DAG.LIS, DstReachingUses);
+
+ for (MachineOperand *RUOp : DstReachingUses) {
+ if (TII->isMAI(*RUOp->getParent()))
+ continue;
+
+      // If there is a non-MAI reaching use, then we need a copy.
+ if (find(DstReachingUseCopies, RUOp) == DstReachingUseCopies.end())
+ DstReachingUseCopies.push_back(RUOp);
+ SmallVector<SlotIndex, 8> DstUsesReachingDefs;
+ findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
+
+ for (SlotIndex RDIndex : DstUsesReachingDefs) {
+ MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
+ if (TII->isMAI(*RD))
+ continue;
+
+        // If there is a non-MAI reaching def of this reaching use, then we will
+ // need a copy.
+ if (find(DstUseDefsReplace, RD) == DstUseDefsReplace.end())
+ DstUseDefsReplace.push_back(RD);
+ }
+ }
+
+ if (!DstUseDefsReplace.empty()) {
+ DenseMap<Register, Register>::iterator RI = RedefMap.find(DstReg);
+ if (RI != RedefMap.end()) {
+ MappedReg = RI->second;
+ } else {
+ assert(!ReachingDefCopyMap.contains(DstReg));
+ const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
+ const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
+
+ // Track the mapping of the original register to the new register.
+ MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
+ RedefMap[DstReg] = MappedReg;
+ }
+
+ // If none exists, create a copy from this reaching def.
+ // We may have inserted a copy already in an earlier iteration.
+ for (MachineInstr *RD : DstUseDefsReplace) {
+        // Do not create redundant copies.
+ if (ReachingDefCopyMap[DstReg].insert(RD).second) {
+ MachineInstrBuilder VGPRCopy =
+ BuildMI(*RD->getParent(), std::next(RD->getIterator()),
+ RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
+ .addDef(MappedReg, {}, 0)
+ .addUse(DstReg, {}, 0);
+ DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
+
+ // If this reaching def was the last MI in the region, update the
+ // region boundaries.
+ DenseMap<MachineInstr *, unsigned>::iterator LMI =
+ LastMIToRegion.find(RD);
+ if (LMI != LastMIToRegion.end()) {
+ unsigned UpdateRegion = LMI->second;
+ DAG.Regions[UpdateRegion].second = VGPRCopy;
+ LastMIToRegion.erase(RD);
+ }
+ }
+ }
+ }
+
+ DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
+ for (MachineOperand *RU : DstReachingUseCopies) {
+ MachineBasicBlock *RUBlock = RU->getParent()->getParent();
+ // Just keep track of the reaching use of this register by block. After we
+      // have scanned all the MFMAs, we can find optimal insertion points.
+ if (RUBlock != MI->getParent()) {
+ ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU);
+ continue;
+ }
+
+      // Special case: the use is in the same block as the MFMA. Insert the copy
+ // just before the use.
+ const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
+ const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
+ Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
+ MachineInstr *UseInst = RU->getParent();
+ MachineInstrBuilder VGPRCopy =
+ BuildMI(*UseInst->getParent(), UseInst->getIterator(),
+ UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
+ .addDef(NewUseReg, {}, 0)
+ .addUse(DstReg, {}, 0);
+ DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
+ // Since we know this use has only one reaching def, we can replace the
+ // use reg.
+ RU->setReg(NewUseReg);
+      // Track the copy source operand for replacement.
+ DstRegSet.insert(&VGPRCopy->getOperand(1));
+ }
+
+ // Track the register for reclassification
+ RewriteRegs.insert(DstReg);
+
+ // Insert the dst operand for replacement. If this dst is in a chain of
+ // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
+    // all the corresponding operands need to be replaced.
+ DstRegSet.insert(Dst);
+ }
+
+ // Handle the copies for dst uses.
+ using RUBType =
+ std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>;
+ for (RUBType RUBlockEntry : ReachingUseTracker) {
+ using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>;
+ for (RUDType RUDst : RUBlockEntry.second) {
+ MachineOperand *OpBegin = *RUDst.second.begin();
+ SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent());
+
+ // Find the earliest use in this block.
+ for (MachineOperand *User : RUDst.second) {
+ SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent());
+ if (SlotIndex::isEarlierInstr(NewInstPt, InstPt))
+ InstPt = NewInstPt;
+ }
+
+ const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(RUDst.first);
+ const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
+ Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
+ MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(InstPt);
+
+ MachineInstrBuilder VGPRCopy =
+ BuildMI(*UseInst->getParent(), UseInst->getIterator(),
+ UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
+ .addDef(NewUseReg, {}, 0)
+ .addUse(RUDst.first, {}, 0);
+ DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
+
+ // If this UseInst was the first MI in the region, update the region
+ // boundaries.
+ DenseMap<MachineInstr *, unsigned>::iterator FI =
+ FirstMIToRegion.find(UseInst);
+ if (FI != FirstMIToRegion.end()) {
+ unsigned UpdateRegion = FI->second;
+ DAG.Regions[UpdateRegion].first = VGPRCopy;
+ FirstMIToRegion.erase(UseInst);
+ }
+
+ // Replace the operand for all users.
+ for (MachineOperand *User : RUDst.second) {
+ User->setReg(NewUseReg);
+ }
+
+ // Track the copy source operand for replacement.
+ ReplaceMap[RUDst.first].insert(&VGPRCopy->getOperand(1));
+ }
+ }
+
+ // We may have needed to insert copies after the reaching defs of the MFMAs.
+ // Replace the original register with the result of the copy for all relevant
+ // operands.
+ for (std::pair<Register, Register> NewDef : RedefMap) {
+ Register OldReg = NewDef.first;
+ Register NewReg = NewDef.second;
+
+ // Replace the register for any associated operand in the MFMA chain.
+ for (MachineOperand *ReplaceOp : ReplaceMap[OldReg])
+ ReplaceOp->setReg(NewReg);
+ }
+
+ // Finally, do the reclassification of the MFMA registers.
+ for (Register RewriteReg : RewriteRegs) {
+ Register RegToRewrite = RewriteReg;
+
+ // Be sure to update the replacement register and not the original.
+ DenseMap<Register, Register>::iterator RI = RedefMap.find(RewriteReg);
+ if (RI != RedefMap.end())
+ RegToRewrite = RI->second;
+
+ const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(RegToRewrite);
+ const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(CurrRC);
+
+ DAG.MRI.setRegClass(RegToRewrite, AGPRRC);
+ }
+
+ // Bulk update the LIS.
+ DAG.LIS->reanalyze(DAG.MF);
+  // Live-ins may have been modified by cross-RC copies.
+ RegionPressureMap LiveInUpdater(&DAG, false);
+ LiveInUpdater.buildLiveRegMap();
+
+ for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
+ DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region);
+
+ DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx);
+
+ return true;
+}
+
+unsigned PreRARematStage::getStageTargetOccupancy() const {
+ return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU();
+}
+
+bool PreRARematStage::setObjective() {
const Function &F = MF.getFunction();
- // Maps optimizable regions (i.e., regions at minimum and register-limited
- // occupancy, or regions with spilling) to the target RP we would like to
- // reach.
- DenseMap<unsigned, GCNRPTarget> OptRegions;
+ // Set up "spilling targets" for all regions.
unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
- auto ResetTargetRegions = [&]() {
- OptRegions.clear();
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- const GCNRegPressure &RP = DAG.Pressure[I];
- GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP);
- if (!Target.satisfied())
- OptRegions.insert({I, Target});
- }
- };
+ bool HasVectorRegisterExcess = false;
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ const GCNRegPressure &RP = DAG.Pressure[I];
+ GCNRPTarget &Target = RPTargets.emplace_back(MaxSGPRs, MaxVGPRs, MF, RP);
+ if (!Target.satisfied())
+ TargetRegions.set(I);
+ HasVectorRegisterExcess |= Target.hasVectorRegisterExcess();
+ }
- ResetTargetRegions();
- if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
+ if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
// In addition to register usage being above addressable limits, occupancy
// below the minimum is considered like "spilling" as well.
TargetOcc = std::nullopt;
@@ -1839,94 +2729,68 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
// There is no spilling and room to improve occupancy; set up "increased
// occupancy targets" for all regions.
TargetOcc = DAG.MinOccupancy + 1;
- unsigned VGPRBlockSize =
- MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize();
MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
- ResetTargetRegions();
- }
- REMAT_DEBUG({
- dbgs() << "Analyzing ";
- MF.getFunction().printAsOperand(dbgs(), false);
- dbgs() << ": ";
- if (OptRegions.empty()) {
- dbgs() << "no objective to achieve, occupancy is maximal at "
- << MFI.getMaxWavesPerEU();
- } else if (!TargetOcc) {
- dbgs() << "reduce spilling (minimum target occupancy is "
- << MFI.getMinWavesPerEU() << ')';
- } else {
- dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to "
- << TargetOcc;
- }
- dbgs() << '\n';
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) {
- dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond()
- << '\n';
- }
+ for (auto [I, Target] : enumerate(RPTargets)) {
+ Target.setTarget(MaxSGPRs, MaxVGPRs);
+ if (!Target.satisfied())
+ TargetRegions.set(I);
}
- });
- if (OptRegions.empty())
- return false;
+ }
- // Accounts for a reduction in RP in an optimizable region. Returns whether we
- // estimate that we have identified enough rematerialization opportunities to
- // achieve our goal, and sets Progress to true when this particular reduction
- // in pressure was helpful toward that goal.
- auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
- bool &Progress) -> bool {
- GCNRPTarget &Target = OptIt->getSecond();
- if (!Target.isSaveBeneficial(Reg))
- return false;
- Progress = true;
- Target.saveReg(Reg, Mask, DAG.MRI);
- if (Target.satisfied())
- OptRegions.erase(OptIt->getFirst());
- return OptRegions.empty();
- };
+ return TargetRegions.any();
+}
+
+bool PreRARematStage::collectRematRegs(
+ const DenseMap<MachineInstr *, unsigned> &MIRegion) {
// We need up-to-date live-out info. to query live-out register masks in
// regions containing rematerializable instructions.
DAG.RegionLiveOuts.buildLiveRegMap();
- // Cache set of registers that are going to be rematerialized.
- DenseSet<unsigned> RematRegs;
+ // Set of registers already marked for potential rematerialization; used to
+ // avoid rematerialization chains.
+ SmallSet<Register, 4> MarkedRegs;
+ auto IsMarkedForRemat = [&MarkedRegs](const MachineOperand &MO) -> bool {
+ return MO.isReg() && MarkedRegs.contains(MO.getReg());
+ };
// Identify rematerializable instructions in the function.
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- auto Region = DAG.Regions[I];
- for (auto MI = Region.first; MI != Region.second; ++MI) {
+ RegionBoundaries Bounds = DAG.Regions[I];
+ for (auto MI = Bounds.first; MI != Bounds.second; ++MI) {
// The instruction must be rematerializable.
MachineInstr &DefMI = *MI;
if (!isReMaterializable(DefMI))
continue;
- // We only support rematerializing virtual registers with one definition.
+ // We only support rematerializing virtual registers with one
+ // definition.
Register Reg = DefMI.getOperand(0).getReg();
if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
continue;
// We only care to rematerialize the instruction if it has a single
- // non-debug user in a different region. The using MI may not belong to a
- // region if it is a lone region terminator.
+ // non-debug user in a different region.
+ // FIXME: Allow rematerializations with multiple uses. This should be
+ // relatively easy to support using the current cost model.
MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
if (!UseMI)
continue;
auto UseRegion = MIRegion.find(UseMI);
- if (UseRegion != MIRegion.end() && UseRegion->second == I)
+ if (UseRegion == MIRegion.end() || UseRegion->second == I)
continue;
// Do not rematerialize an instruction if it uses or is used by an
// instruction that we have designated for rematerialization.
// FIXME: Allow for rematerialization chains: this requires 1. updating
- // remat points to account for uses that are rematerialized, and 2. either
- // rematerializing the candidates in careful ordering, or deferring the
- // MBB RP walk until the entire chain has been rematerialized.
- if (Rematerializations.contains(UseMI) ||
- llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) {
- return MO.isReg() && RematRegs.contains(MO.getReg());
- }))
+ // remat points to account for uses that are rematerialized, and 2.
+ // either rematerializing the candidates in careful ordering, or
+ // deferring the MBB RP walk until the entire chain has been
+ // rematerialized.
+ const MachineOperand &UseMO = UseMI->getOperand(0);
+ if (IsMarkedForRemat(UseMO) ||
+ llvm::any_of(DefMI.operands(), IsMarkedForRemat))
continue;
// Do not rematerialize an instruction if it uses registers that aren't
@@ -1937,188 +2801,257 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
*DAG.TII))
continue;
- REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
- RematInstruction &Remat =
- Rematerializations.try_emplace(&DefMI, UseMI).first->second;
-
- bool RematUseful = false;
- if (auto It = OptRegions.find(I); It != OptRegions.end()) {
- // Optimistically consider that moving the instruction out of its
- // defining region will reduce RP in the latter; this assumes that
- // maximum RP in the region is reached somewhere between the defining
- // instruction and the end of the region.
- REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
- LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
- if (ReduceRPInRegion(It, Reg, Mask, RematUseful))
- return true;
- }
-
- for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
- // We are only collecting regions in which the register is a live-in
- // (and may be live-through).
- auto It = DAG.LiveIns[LIRegion].find(Reg);
- if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
- continue;
- Remat.LiveInRegions.insert(LIRegion);
-
- // Account for the reduction in RP due to the rematerialization in an
- // optimizable region in which the defined register is a live-in. This
- // is exact for live-through region but optimistic in the using region,
- // where RP is actually reduced only if maximum RP is reached somewhere
- // between the beginning of the region and the rematerializable
- // instruction's use.
- if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
- REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
- if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg],
- RematUseful))
- return true;
- }
- }
-
- // If the instruction is not a live-in or live-out in any optimizable
- // region then there is no point in rematerializing it.
- if (!RematUseful) {
- Rematerializations.pop_back();
- REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n");
- } else {
- RematRegs.insert(Reg);
- }
+ // Add the instruction to the rematerializable list.
+ MarkedRegs.insert(Reg);
+ RematRegs.emplace_back(&DefMI, UseMI, DAG, MIRegion);
}
}
- if (TargetOcc) {
- // We were trying to increase occupancy but failed, abort the stage.
- REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
- Rematerializations.clear();
- return false;
+ return !RematRegs.empty();
+}
+
+PreRARematStage::RematReg::RematReg(
+ MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG,
+ const DenseMap<MachineInstr *, unsigned> &MIRegion)
+ : DefMI(DefMI), UseMI(UseMI), LiveIn(DAG.Regions.size()),
+ LiveOut(DAG.Regions.size()), Live(DAG.Regions.size()),
+ DefRegion(MIRegion.at(DefMI)), UseRegion(MIRegion.at(UseMI)) {
+
+ // Mark regions in which the rematerializable register is live.
+ Register Reg = getReg();
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ auto LiveInIt = DAG.LiveIns[I].find(Reg);
+ if (LiveInIt != DAG.LiveIns[I].end())
+ LiveIn.set(I);
+ const auto &LiveOuts = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I);
+ if (auto LiveOutIt = LiveOuts.find(Reg); LiveOutIt != LiveOuts.end())
+ LiveOut.set(I);
+ }
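+ // With a single def and a single use in different regions, the register is
+ // live-in and/or live-out of every region in which it is live.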
+ Live |= LiveIn;
+ Live |= LiveOut;
+ Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(DefRegion).at(Reg);
+}
+
+bool PreRARematStage::RematReg::maybeBeneficial(
+ const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const {
+ Register Reg = getReg();
+ for (unsigned I : TargetRegions.set_bits()) {
+ if (Live[I] && RPTargets[I].isSaveBeneficial(Reg))
+ return true;
}
- REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
- return !Rematerializations.empty();
+ return false;
}
-void PreRARematStage::rematerialize() {
- const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+void PreRARematStage::RematReg::insertMI(unsigned RegionIdx,
+ MachineInstr *RematMI,
+ GCNScheduleDAGMILive &DAG) const {
+ RegionBoundaries &Bounds = DAG.Regions[RegionIdx];
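+ // If the MI was inserted just before the region's first instruction, grow
+ // the region upwards to include it.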
+ if (Bounds.first == std::next(MachineBasicBlock::iterator(RematMI)))
+ Bounds.first = RematMI;
+ DAG.LIS->InsertMachineInstrInMaps(*RematMI);
+ DAG.LIS->createAndComputeVirtRegInterval(RematMI->getOperand(0).getReg());
+}
+
+PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
+ MachineFunction &MF, const GCNScheduleDAGMILive &DAG) {
+ assert(DAG.MLI && "MLI not defined in DAG");
+ MachineBranchProbabilityInfo MBPI;
+ MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
- // Collect regions whose RP changes in unpredictable way; we will have to
- // fully recompute their RP after all rematerailizations.
- DenseSet<unsigned> RecomputeRP;
-
- // Rematerialize all instructions.
- for (auto &[DefMI, Remat] : Rematerializations) {
- MachineBasicBlock::iterator InsertPos(Remat.UseMI);
- Register Reg = DefMI->getOperand(0).getReg();
- unsigned DefRegion = MIRegion.at(DefMI);
-
- // Rematerialize DefMI to its use block.
- TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
- AMDGPU::NoSubRegister, *DefMI, *DAG.TRI);
- Remat.RematMI = &*std::prev(InsertPos);
- DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
-
- // Update region boundaries in regions we sinked from (remove defining MI)
- // and to (insert MI rematerialized in use block). Only then we can erase
- // the original MI.
- DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
- auto UseRegion = MIRegion.find(Remat.UseMI);
- if (UseRegion != MIRegion.end()) {
- DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
- Remat.RematMI);
+ const unsigned NumRegions = DAG.Regions.size();
+ MinFreq = MBFI.getEntryFreq().getFrequency();
+ MaxFreq = 0;
+ Regions.reserve(NumRegions);
+ for (unsigned I = 0; I < NumRegions; ++I) {
+ MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
+ uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
+ Regions.push_back(BlockFreq);
+ if (BlockFreq && BlockFreq < MinFreq)
+ MinFreq = BlockFreq;
+ if (BlockFreq > MaxFreq)
+ MaxFreq = BlockFreq;
+ }
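+ // MinFreq is only zero when the entry frequency itself is zero, in which
+ // case frequencies carry no information and scaling is pointless.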
+ if (!MinFreq)
+ return;
+
+ // Scale everything down if frequencies are high.
+ if (MinFreq >= ScaleFactor * ScaleFactor) {
+ for (uint64_t &Freq : Regions)
+ Freq /= ScaleFactor;
+ MinFreq /= ScaleFactor;
+ MaxFreq /= ScaleFactor;
+ }
+}
+
+PreRARematStage::ScoredRemat::ScoredRemat(RematReg *Remat, const FreqInfo &Freq,
+ const GCNScheduleDAGMILive &DAG)
+ : Remat(Remat), NumRegs(getNumRegs(DAG)), FreqDiff(getFreqDiff(Freq)) {}
+
+unsigned PreRARematStage::ScoredRemat::getNumRegs(
+ const GCNScheduleDAGMILive &DAG) const {
+ const TargetRegisterClass &RC = *DAG.MRI.getRegClass(Remat->getReg());
+ unsigned RegSize = DAG.TRI->getRegSizeInBits(RC);
+ if (unsigned SubIdx = Remat->DefMI->getOperand(0).getSubReg()) {
+ // The following may return -1 (i.e., a large unsigned number) on indices
+ // that may be used to access subregisters of multiple sizes; in such cases
+ // fall back on the size derived from the register class.
+ unsigned SubRegSize = DAG.TRI->getSubRegIdxSize(SubIdx);
+ if (SubRegSize < RegSize)
+ RegSize = SubRegSize;
+ }
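+ // Express the size as a number of 32-bit registers, rounding up.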
+ return divideCeil(RegSize, 32);
+}
+
+int64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
+ // Get frequencies of defining and using regions. A rematerialization from the
+ // least frequent region to the most frequent region will yield the greatest
+ // latency penalty and therefore should get minimum score. Conversely, a
+ // rematerialization in the other direction should get maximum score. Default
+ // to values that will yield the worst possible score given known frequencies
+ // in order to penalize rematerializations from or into regions whose
+ // frequency is unknown.
+ int64_t DefOrMin = std::max(Freq.Regions[Remat->DefRegion], Freq.MinFreq);
+ int64_t UseOrMax = Freq.Regions[Remat->UseRegion];
+ if (!UseOrMax)
+ UseOrMax = Freq.MaxFreq;
+ return DefOrMin - UseOrMax;
+}
+
+void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
+ ArrayRef<GCNRPTarget> RPTargets,
+ const FreqInfo &FreqInfo,
+ bool ReduceSpill) {
+ MaxFreq = 0;
+ RegionImpact = 0;
+ for (unsigned I : TargetRegions.set_bits()) {
+ if (!Remat->Live[I] || !RPTargets[I].isSaveBeneficial(Remat->getReg()))
+ continue;
+ bool UnusedLT = Remat->isUnusedLiveThrough(I);
+
+ // Regions in which RP is guaranteed to decrease have more weight.
+ RegionImpact += UnusedLT ? 2 : 1;
+
+ if (ReduceSpill) {
+ uint64_t Freq = FreqInfo.Regions[I];
+ if (!UnusedLT) {
+ // Apply a frequency penalty in regions in which we are not sure that RP
+ // will decrease.
+ Freq /= 2;
+ }
+ MaxFreq = std::max(MaxFreq, Freq);
}
- DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
- DefMI->eraseFromParent();
+ }
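+ // Weigh the region impact by the number of 32-bit registers being freed.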
+ RegionImpact *= NumRegs;
+}
- // Collect all regions impacted by the rematerialization and update their
- // live-in/RP information.
- for (unsigned I : Remat.LiveInRegions) {
- ImpactedRegions.insert({I, DAG.Pressure[I]});
- GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
+void PreRARematStage::rematerialize(const RematReg &Remat,
+ BitVector &RecomputeRP,
+ RollbackInfo *Rollback) {
+ const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+ MachineInstr &DefMI = *Remat.DefMI;
+ Register Reg = DefMI.getOperand(0).getReg();
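+ // Clone the register so the rematerialized definition gets a fresh live
+ // interval while the original register is kept for potential rollback.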
+ Register NewReg = DAG.MRI.cloneVirtualRegister(Reg);
+
+ // Rematerialize the register in the region where it is used.
+ MachineBasicBlock::iterator InsertPos = Remat.UseMI;
+ TII->reMaterialize(*InsertPos->getParent(), InsertPos, NewReg, 0, DefMI);
+ MachineInstr *RematMI = &*std::prev(InsertPos);
+ Remat.UseMI->substituteRegister(Reg, NewReg, 0, *DAG.TRI);
+ Remat.insertMI(Remat.UseRegion, RematMI, DAG);
+ if (Rollback) {
+ Rollback->RematMI = RematMI;
+ // Make the original MI a debug value so that it does not influence
+ // scheduling, and replace all read registers with a sentinel register to
+ // prevent its operands from appearing in register use-lists during LIS
+ // updates. Store mappings from operand indices to original registers
+ // for potential rollback.
+ DefMI.setDesc(TII->get(TargetOpcode::DBG_VALUE));
+ for (auto [Idx, MO] : enumerate(Remat.DefMI->operands())) {
+ if (MO.isReg() && MO.readsReg()) {
+ Rollback->RegMap.insert({Idx, MO.getReg()});
+ MO.setReg(Register());
+ }
+ }
+ } else {
+ // Just delete the original instruction if it cannot be rolled back.
+ DAG.deleteMI(Remat.DefRegion, &DefMI);
+ }
#ifdef EXPENSIVE_CHECKS
- // All uses are known to be available / live at the remat point. Thus, the
- // uses should already be live in to the region.
- for (MachineOperand &MO : DefMI->operands()) {
- if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
- continue;
+ // All uses are known to be available / live at the remat point. Thus,
+ // the uses should already be live in to the using region.
+ for (MachineOperand &MO : DefMI.operands()) {
+ if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
+ continue;
- Register UseReg = MO.getReg();
- if (!UseReg.isVirtual())
- continue;
+ Register UseReg = MO.getReg();
+ if (!UseReg.isVirtual())
+ continue;
- LiveInterval &LI = DAG.LIS->getInterval(UseReg);
- LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
- if (LI.hasSubRanges() && MO.getSubReg())
- LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
-
- LaneBitmask LiveInMask = RegionLiveIns.at(UseReg);
- LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
- // If this register has lanes not covered by the LiveIns, be sure they
- // do not map to any subrange. ref:
- // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
- if (UncoveredLanes.any()) {
- assert(LI.hasSubRanges());
- for (LiveInterval::SubRange &SR : LI.subranges())
- assert((SR.LaneMask & UncoveredLanes).none());
- }
- }
+ LiveInterval &LI = DAG.LIS->getInterval(UseReg);
+ LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
+ if (LI.hasSubRanges() && MO.getSubReg())
+ LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
+
+ LaneBitmask LiveInMask = DAG.LiveIns[Remat.UseRegion].at(UseReg);
+ LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
+ // If this register has lanes not covered by the LiveIns, be sure they
+ // do not map to any subrange. ref:
+ // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
+ if (UncoveredLanes.any()) {
+ assert(LI.hasSubRanges());
+ for (LiveInterval::SubRange &SR : LI.subranges())
+ assert((SR.LaneMask & UncoveredLanes).none());
+ }
+ }
#endif
- // The register is no longer a live-in in all regions but the one that
- // contains the single use. In live-through regions, maximum register
- // pressure decreases predictably so we can directly update it. In the
- // using region, maximum RP may or may not decrease, so we will mark it
- // for re-computation after all materializations have taken place.
- LaneBitmask PrevMask = RegionLiveIns[Reg];
- RegionLiveIns.erase(Reg);
- RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask});
- if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
- DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
- else
- RecomputeRP.insert(I);
+ // Remove the register from all regions where it is a live-in or live-out
+ // and adjust RP targets. The save is guaranteed in regions in which the
+ // register is live-through and unused but optimistic in all other regions
+ // where the register is live.
+ for (unsigned I : Remat.Live.set_bits()) {
+ RPTargets[I].saveReg(Reg, Remat.Mask, DAG.MRI);
+ DAG.LiveIns[I].erase(Reg);
+ DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).erase(Reg);
+ if (!Remat.isUnusedLiveThrough(I))
+ RecomputeRP.set(I);
+ }
+
+ RescheduleRegions |= Remat.Live;
+}
+
+void PreRARematStage::commitRematerializations() const {
+ REMAT_DEBUG(dbgs() << "Commiting all rematerializations\n");
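+ // The original defining MIs were downgraded to DBG_VALUE placeholders when
+ // rematerializing; deleting them makes the rematerializations permanent.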
+ for (const RollbackInfo &Rollback : Rollbacks)
+ DAG.deleteMI(Rollback.Remat->DefRegion, Rollback.Remat->DefMI);
+}
+
+void PreRARematStage::unsetSatisifedRPTargets(const BitVector &Regions) {
+ for (unsigned I : Regions.set_bits()) {
+ if (TargetRegions[I] && RPTargets[I].satisfied()) {
+ REMAT_DEBUG(dbgs() << " [" << I << "] Target reached!\n");
+ TargetRegions.reset(I);
}
- // RP in the region from which the instruction was rematerialized may or may
- // not decrease.
- ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]});
- RecomputeRP.insert(DefRegion);
-
- // Recompute live interval to reflect the register's rematerialization.
- Register RematReg = Remat.RematMI->getOperand(0).getReg();
- DAG.LIS->removeInterval(RematReg);
- DAG.LIS->createAndComputeVirtRegInterval(RematReg);
- }
-
- // All regions impacted by at least one rematerialization must be rescheduled.
- // Maximum pressure must also be recomputed for all regions where it changed
- // non-predictably and checked against the target occupancy.
- unsigned DynamicVGPRBlockSize =
- MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
- AchievedOcc = MFI.getMaxWavesPerEU();
- for (auto &[I, OriginalRP] : ImpactedRegions) {
- bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
- RescheduleRegions[I] = !IsEmptyRegion;
- if (!RecomputeRP.contains(I))
- continue;
+ }
+}
- GCNRegPressure RP;
- if (IsEmptyRegion) {
- RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
- } else {
- GCNDownwardRPTracker RPT(*DAG.LIS);
- auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
- DAG.Regions[I].second);
- if (NonDbgMI == DAG.Regions[I].second) {
- // Region is non-empty but contains only debug instructions.
- RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
- } else {
- RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
- RPT.advance(DAG.Regions[I].second);
- RP = RPT.moveMaxPressure();
- }
+bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) {
+ bool TooOptimistic = false;
+ for (unsigned I : Regions.set_bits()) {
+ GCNRPTarget &Target = RPTargets[I];
+ Target.setRP(DAG.getRealRegPressure(I));
+
+ // Since we were optimistic in assessing RP decreases in these regions, we
+ // may need to re-mark the region as a target region if RP didn't decrease
+ // as expected.
+ if (!TargetRegions[I] && !Target.satisfied()) {
+ REMAT_DEBUG(dbgs() << " [" << I << "] Incorrect RP estimation\n");
+ TooOptimistic = true;
+ TargetRegions.set(I);
}
- DAG.Pressure[I] = RP;
- AchievedOcc =
- std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
}
- REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
+ return TooOptimistic;
}
// Copied from MachineLICM
@@ -2141,80 +3074,116 @@ bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {
void PreRARematStage::finalizeGCNSchedStage() {
// We consider that reducing spilling is always beneficial so we never
- // rollback rematerializations in such cases. It's also possible that
- // rescheduling lowers occupancy over the one achieved just through remats, in
- // which case we do not want to rollback either (the rescheduling was already
- // reverted in PreRARematStage::shouldRevertScheduling in such cases).
- unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
- if (!TargetOcc || MaxOcc >= *TargetOcc)
+ // roll back rematerializations or revert scheduling in such cases.
+ if (!TargetOcc)
return;
- REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
- const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+ // When increasing occupancy, it is possible that re-scheduling is not able to
+ // achieve the target occupancy in all regions, in which case re-scheduling in
+ // all regions should be reverted.
+ if (DAG.MinOccupancy >= *TargetOcc) {
+ commitRematerializations();
+ return;
+ }
- // Rollback the rematerializations.
- for (const auto &[DefMI, Remat] : Rematerializations) {
- MachineInstr &RematMI = *Remat.RematMI;
- unsigned DefRegion = MIRegion.at(DefMI);
- MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
- MachineBasicBlock *MBB = RegionBB[DefRegion];
- Register Reg = RematMI.getOperand(0).getReg();
-
- // Re-rematerialize MI at the end of its original region. Note that it may
- // not be rematerialized exactly in the same position as originally within
- // the region, but it should not matter much.
- TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI,
- *DAG.TRI);
- MachineInstr *NewMI = &*std::prev(InsertPos);
- DAG.LIS->InsertMachineInstrInMaps(*NewMI);
-
- auto UseRegion = MIRegion.find(Remat.UseMI);
- if (UseRegion != MIRegion.end()) {
- DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI,
- nullptr);
+ // It is possible that re-scheduling lowers occupancy below the one achieved
+ // just through rematerializations, in which case we revert re-scheduling in
+ // all regions but do not roll back rematerializations.
+ const bool ShouldRollbackRemats = AchievedOcc < *TargetOcc;
+
+ // When we both need to revert re-scheduling and roll back rematerializations,
+ // restore rematerialized MIs' original state before reverting so that they
+ // are treated as non-debug instructions by the revert logic.
+ if (ShouldRollbackRemats) {
+ for (const RollbackInfo &Rollback : Rollbacks) {
+ const auto &[Remat, RematMI, RegMap] = Rollback;
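+ // Restore the opcode and the register operands that were stashed when the
+ // original def was downgraded to a DBG_VALUE.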
+ Remat->DefMI->setDesc(DAG.TII->get(RematMI->getOpcode()));
+ for (const auto &[MOIdx, Reg] : RegMap)
+ Remat->DefMI->getOperand(MOIdx).setReg(Reg);
}
- DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI);
+ }
- // Erase rematerialized MI.
- DAG.LIS->RemoveMachineInstrFromMaps(RematMI);
- RematMI.eraseFromParent();
+ // Revert re-scheduling in all affected regions.
+ for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) {
+ REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx
+ << '\n');
+ DAG.Pressure[RegionIdx] = MaxPressure;
+ modifyRegionSchedule(RegionIdx, RegionBB[RegionIdx], OrigMIOrder);
+ }
- // Recompute live interval for the re-rematerialized register
+ if (!ShouldRollbackRemats) {
+ commitRematerializations();
+ DAG.setTargetOccupancy(AchievedOcc);
+ return;
+ }
+
+ // Reset the target occupancy to what it was pre-rematerialization.
+ DAG.setTargetOccupancy(*TargetOcc - 1);
+
+ // Finish rolling back rematerializations, then recompute pressure in all
+ // affected regions.
+ REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n");
+ BitVector RecomputeRP(DAG.Regions.size());
+ DenseSet<Register> RecomputeLI;
+ for (const RollbackInfo &Rollback : Rollbacks) {
+ const auto &[Remat, RematMI, RegMap] = Rollback;
+
+ // Switch back to using the original register and delete the
+ // rematerialization.
+ Register Reg = RematMI->getOperand(0).getReg();
+ Register OriginalReg = Remat->DefMI->getOperand(0).getReg();
+ Remat->UseMI->substituteRegister(Reg, OriginalReg, 0, *DAG.TRI);
+ REMAT_DEBUG(dbgs() << '[' << Remat->UseRegion
+ << "] Deleting rematerialization " << *RematMI);
+ DAG.deleteMI(Remat->UseRegion, RematMI);
+
+ // Re-add the defined register as a live-in/live-out in all regions it used
+ // to be one in.
+ std::pair<Register, LaneBitmask> LiveReg(OriginalReg, Remat->Mask);
+ for (unsigned I : Remat->LiveIn.set_bits())
+ DAG.LiveIns[I].insert(LiveReg);
+ for (unsigned I : Remat->LiveOut.set_bits())
+ DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).insert(LiveReg);
+
+ RecomputeRP |= Rollback.Remat->Live;
+ // Regenerate intervals for all register operands of rematerialized MIs as
+ // slot indices may have changed slightly from before re-scheduling.
+ for (MachineOperand &MO : Rollback.Remat->DefMI->operands()) {
+ if (MO.isReg() && MO.getReg().isVirtual())
+ RecomputeLI.insert(MO.getReg());
+ }
+ }
+ for (Register Reg : RecomputeLI) {
DAG.LIS->removeInterval(Reg);
DAG.LIS->createAndComputeVirtRegInterval(Reg);
-
- // Re-add the register as a live-in in all regions it used to be one in.
- for (unsigned LIRegion : Remat.LiveInRegions)
- DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})});
}
-
- // Reset RP in all impacted regions.
- for (auto &[I, OriginalRP] : ImpactedRegions)
- DAG.Pressure[I] = OriginalRP;
+#ifdef EXPENSIVE_CHECKS
+ // In particular, we want to check for coherent MI/slot order in regions in
+ // which reverts and/or rollbacks may have happened.
+ MF.verify();
+#endif
+ for (unsigned I : RecomputeRP.set_bits())
+ DAG.Pressure[I] = DAG.getRealRegPressure(I);
GCNSchedStage::finalizeGCNSchedStage();
}
-void GCNScheduleDAGMILive::updateRegionBoundaries(
- RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
- MachineInstr *NewMI) {
- assert((!NewMI || NewMI != RegionBounds.second) &&
- "cannot remove at region end");
-
- if (RegionBounds.first == RegionBounds.second) {
- assert(NewMI && "cannot remove from an empty region");
- RegionBounds.first = NewMI;
- return;
- }
+void GCNScheduleDAGMILive::deleteMI(unsigned RegionIdx, MachineInstr *MI) {
+ // It's not possible for the deleted instruction to be the upper region
+ // boundary since we don't delete region terminators.
+ if (Regions[RegionIdx].first == MI)
+ Regions[RegionIdx].first = std::next(MachineBasicBlock::iterator(MI));
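+ // Every MI deleted through here defines a register; drop its live interval
+ // before erasing the instruction.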
+ LIS->removeInterval(MI->getOperand(0).getReg());
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+}
- // We only care for modifications at the beginning of a non-empty region since
- // the upper region boundary is exclusive.
- if (MI != RegionBounds.first)
- return;
- if (!NewMI)
- RegionBounds.first = std::next(MI); // Removal
+void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) {
+ MinOccupancy = TargetOccupancy;
+ if (MFI.getOccupancy() < TargetOccupancy)
+ MFI.increaseOccupancy(MF, MinOccupancy);
else
- RegionBounds.first = NewMI; // Insertion
+ MFI.limitOccupancy(MinOccupancy);
}
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 975781f..6b6a403 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -16,6 +16,9 @@
#include "GCNRegPressure.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
@@ -28,11 +31,12 @@ class GCNSchedStage;
enum class GCNSchedStageID : unsigned {
OccInitialSchedule = 0,
- UnclusteredHighRPReschedule = 1,
- ClusteredLowOccupancyReschedule = 2,
- PreRARematerialize = 3,
- ILPInitialSchedule = 4,
- MemoryClauseInitialSchedule = 5
+ RewriteMFMAForm = 1,
+ UnclusteredHighRPReschedule = 2,
+ ClusteredLowOccupancyReschedule = 3,
+ PreRARematerialize = 4,
+ ILPInitialSchedule = 5,
+ MemoryClauseInitialSchedule = 6
};
#ifndef NDEBUG
@@ -183,7 +187,7 @@ class ScheduleMetrics {
unsigned BubbleCycles;
public:
- ScheduleMetrics() {}
+ ScheduleMetrics() = default;
ScheduleMetrics(unsigned L, unsigned BC)
: ScheduleLength(L), BubbleCycles(BC) {}
unsigned getLength() const { return ScheduleLength; }
@@ -198,8 +202,7 @@ public:
};
inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) {
- dbgs() << "\n Schedule Metric (scaled by "
- << ScheduleMetrics::ScaleFactor
+ dbgs() << "\n Schedule Metric (scaled by " << ScheduleMetrics::ScaleFactor
<< " ) is: " << Sm.getMetric() << " [ " << Sm.getBubbles() << "/"
<< Sm.getLength() << " ]\n";
return OS;
@@ -217,7 +220,7 @@ class RegionPressureMap {
bool IsLiveOut;
public:
- RegionPressureMap() {}
+ RegionPressureMap() = default;
RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut)
: DAG(GCNDAG), IsLiveOut(LiveOut) {}
// Build the Instr->LiveReg and RegionIdx->Instr maps
@@ -239,6 +242,7 @@ using RegionBoundaries =
class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
friend class GCNSchedStage;
friend class OccInitialScheduleStage;
+ friend class RewriteMFMAFormStage;
friend class UnclusteredHighRPStage;
friend class ClusteredLowOccStage;
friend class PreRARematStage;
@@ -300,18 +304,15 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// Compute and cache live-ins and pressure for all regions in block.
void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB);
- /// If necessary, updates a region's boundaries following insertion ( \p NewMI
- /// != nullptr) or removal ( \p NewMI == nullptr) of a \p MI in the region.
- /// For an MI removal, this must be called before the MI is actually erased
- /// from its parent MBB.
- void updateRegionBoundaries(RegionBoundaries &RegionBounds,
- MachineBasicBlock::iterator MI,
- MachineInstr *NewMI);
+ /// Makes the scheduler try to achieve an occupancy of \p TargetOccupancy.
+ void setTargetOccupancy(unsigned TargetOccupancy);
void runSchedStages();
std::unique_ptr<GCNSchedStage> createSchedStage(GCNSchedStageID SchedStageID);
+ void deleteMI(unsigned RegionIdx, MachineInstr *MI);
+
public:
GCNScheduleDAGMILive(MachineSchedContext *C,
std::unique_ptr<MachineSchedStrategy> S);
@@ -367,12 +368,12 @@ public:
// be skipped.
virtual bool initGCNRegion();
+ // Finalize state after scheduling a region.
+ virtual void finalizeGCNRegion();
+
// Track whether a new region is also a new MBB.
void setupNewBlock();
- // Finalize state after scheudling a region.
- void finalizeGCNRegion();
-
// Check result of scheduling.
void checkScheduling();
@@ -397,8 +398,12 @@ public:
// Returns true if the new schedule may result in more spilling.
bool mayCauseSpilling(unsigned WavesAfter);
- // Attempt to revert scheduling for this region.
- void revertScheduling();
+ /// Sets the schedule of region \p RegionIdx in block \p MBB to \p MIOrder.
+ /// The MIs in \p MIOrder must be exactly the same as the ones currently
+ /// existing inside the region, only in a different order that honors def-use
+ /// chains.
+ void modifyRegionSchedule(unsigned RegionIdx, MachineBasicBlock *MBB,
+ ArrayRef<MachineInstr *> MIOrder);
void advanceRegion() { RegionIdx++; }
@@ -413,10 +418,67 @@ public:
: GCNSchedStage(StageID, DAG) {}
};
+class RewriteMFMAFormStage : public GCNSchedStage {
+private:
+ // Record regions with excess ArchVGPR register pressure over the physical
+ // register limit. Register pressure in these regions will usually result in
+ // spilling.
+ BitVector RegionsWithExcessArchVGPR;
+
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *SRI;
+
+ /// Do a speculative rewrite and collect copy locations. The speculative
+ /// rewrite allows us to calculate the RP of the code after the rewrite, and
+ /// the copy locations allow us to calculate the total cost of copies required
+ /// for the rewrite. Stores the rewritten instructions in \p RewriteCands,
+ /// the copy locations for uses (of the MFMA result) in \p CopyForUse, and the
+ /// copy locations for defs (of the MFMA operands) in \p CopyForDef.
+ bool
+ initHeuristics(std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+ DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+ SmallPtrSetImpl<MachineInstr *> &CopyForDef);
+
+ /// Calculate the rewrite cost and undo the state change (e.g. rewriting) done
+ /// in initHeuristics. Uses \p CopyForUse and \p CopyForDef to calculate copy
+ /// costs, and \p RewriteCands to undo rewriting.
+ int64_t getRewriteCost(
+ const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+ const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+ const SmallPtrSetImpl<MachineInstr *> &CopyForDef);
+
+ /// Do the final rewrite on \p RewriteCands and insert any needed copies.
+ bool
+ rewrite(const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands);
+
+ /// \returns true if this MI is a rewrite candidate.
+ bool isRewriteCandidate(MachineInstr *MI) const;
+
+ /// Finds all the reaching defs of \p UseMO and stores the SlotIndexes into \p
+ /// DefIdxs.
+ void findReachingDefs(MachineOperand &UseMO, LiveIntervals *LIS,
+ SmallVectorImpl<SlotIndex> &DefIdxs);
+
+ /// Finds all the reaching uses of \p DefMI and stores the use operands in \p
+ /// ReachingUses.
+ void findReachingUses(MachineInstr *DefMI, LiveIntervals *LIS,
+ SmallVectorImpl<MachineOperand *> &ReachingUses);
+
+public:
+ bool initGCNSchedStage() override;
+
+ RewriteMFMAFormStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+ : GCNSchedStage(StageID, DAG) {}
+};
+
class UnclusteredHighRPStage : public GCNSchedStage {
private:
// Save the initial occupancy before starting this stage.
unsigned InitialOccupancy;
+ // Save the temporary target occupancy before starting this stage.
+ unsigned TempTargetOccupancy;
+ // Track whether any region was scheduled by this stage.
+ bool IsAnyRegionScheduled;
public:
bool initGCNSchedStage() override;
@@ -447,65 +509,242 @@ public:
};
/// Attempts to reduce function spilling or, if there is no spilling, to
-/// increase function occupancy by one with respect to ArchVGPR usage by sinking
-/// rematerializable instructions to their use. When the stage
-/// estimates reducing spilling or increasing occupancy is possible, as few
-/// instructions as possible are rematerialized to reduce potential negative
+/// increase function occupancy by one with respect to register usage by sinking
+/// rematerializable instructions to their use. When the stage estimates that
+/// reducing spilling or increasing occupancy is possible, it tries to
+/// rematerialize as few registers as possible to reduce potential negative
/// effects on function latency.
+///
+/// The stage only supports rematerializing registers that meet all of the
+/// following constraints.
+/// 1. The register is virtual and has a single defining instruction.
+/// 2. The single defining instruction is either deemed rematerializable by the
+/// target-independent logic, or if not, has no non-constant and
+/// non-ignorable physical register use.
+/// 3. The register has no virtual register use whose live range would be
+/// extended by the rematerialization.
+/// 4. The register has a single non-debug user in a different region from its
+/// defining region.
+/// 5. The register is not used by or using another register that is going to be
+/// rematerialized.
class PreRARematStage : public GCNSchedStage {
private:
- /// Useful information about a rematerializable instruction.
- struct RematInstruction {
- /// Single use of the rematerializable instruction's defined register,
- /// located in a different block.
+ /// A rematerializable register.
+ struct RematReg {
+ /// Single MI defining the rematerializable register.
+ MachineInstr *DefMI;
+ /// Single user of the rematerializable register.
MachineInstr *UseMI;
- /// Rematerialized version of \p DefMI, set in
- /// PreRARematStage::rematerialize. Used for reverting rematerializations.
- MachineInstr *RematMI;
- /// Set of regions in which the rematerializable instruction's defined
- /// register is a live-in.
- SmallDenseSet<unsigned, 4> LiveInRegions;
+ /// Regions in which the register is live-in/live-out/live anywhere.
+ BitVector LiveIn, LiveOut, Live;
+ /// The rematerializable register's lane bitmask.
+ LaneBitmask Mask;
+ /// Defining and using regions.
+ unsigned DefRegion, UseRegion;
+
+ RematReg(MachineInstr *DefMI, MachineInstr *UseMI,
+ GCNScheduleDAGMILive &DAG,
+ const DenseMap<MachineInstr *, unsigned> &MIRegion);
+
+ /// Returns the rematerializable register. Do not call after deleting the
+ /// original defining instruction.
+ Register getReg() const { return DefMI->getOperand(0).getReg(); }
+
+ /// Determines whether this rematerialization may be beneficial in at least
+ /// one target region.
+ bool maybeBeneficial(const BitVector &TargetRegions,
+ ArrayRef<GCNRPTarget> RPTargets) const;
+
+ /// Determines if the register is both unused and live-through in region \p
+ /// I. This guarantees that rematerializing it will reduce RP in the region.
+ bool isUnusedLiveThrough(unsigned I) const {
+ assert(I < Live.size() && "region index out of range");
+ return LiveIn[I] && LiveOut[I] && I != UseRegion;
+ }
+
+ /// Updates internal structures following a MI rematerialization. Part of
+ /// the stage instead of the DAG because it makes assumptions that are
+ /// specific to the rematerialization process.
+ void insertMI(unsigned RegionIdx, MachineInstr *RematMI,
+ GCNScheduleDAGMILive &DAG) const;
+ };
+
+ /// A scored rematerialization candidate. Higher scores indicate more
+ /// beneficial rematerializations. A null score indicates that the
+ /// rematerialization does not help reduce RP in target regions.
+ struct ScoredRemat {
+ /// The rematerializable register under consideration.
+ RematReg *Remat;
+
+ /// Execution frequency information required by scoring heuristics.
+ /// Frequencies are scaled down if they are high to avoid overflow/underflow
+ /// when combining them.
+ struct FreqInfo {
+ /// Per-region execution frequencies. 0 when unknown.
+ SmallVector<uint64_t> Regions;
+ /// Minimum and maximum observed frequencies.
+ uint64_t MinFreq, MaxFreq;
+
+ FreqInfo(MachineFunction &MF, const GCNScheduleDAGMILive &DAG);
+
+ private:
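+ /// Factor by which high frequencies are scaled down to avoid overflow.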
+ static const uint64_t ScaleFactor = 1024;
+ };
+
+ /// This only initializes state-independent characteristics of \p Remat, not
+ /// the actual score.
+ ScoredRemat(RematReg *Remat, const FreqInfo &Freq,
+ const GCNScheduleDAGMILive &DAG);
+
+ /// Updates the rematerialization's score w.r.t. the current \p RPTargets.
+ /// \p Freq provides per-region execution frequencies.
+ void update(const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets,
+ const FreqInfo &Freq, bool ReduceSpill);
+
+ /// Returns whether the current score is null, indicating the
+ /// rematerialization is useless.
+ bool hasNullScore() const { return !RegionImpact; }
+
+ /// Compares score components of non-null scores pair-wise. A null score
+ /// always compares strictly less than any non-null score.
+ bool operator<(const ScoredRemat &O) const {
+ if (hasNullScore())
+ return !O.hasNullScore();
+ if (O.hasNullScore())
+ return false;
+ if (MaxFreq != O.MaxFreq)
+ return MaxFreq < O.MaxFreq;
+ if (FreqDiff != O.FreqDiff)
+ return FreqDiff < O.FreqDiff;
+ if (RegionImpact != O.RegionImpact)
+ return RegionImpact < O.RegionImpact;
+ // Break ties using pointer to rematerializable register. Rematerializable
+ // registers are collected in instruction order so, within the same
+ // region, this will prefer registers defined earlier that have longer
+ // live ranges in their defining region (since the registers we consider
+ // are always live-out in their defining region).
+ return Remat > O.Remat;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ Printable print() const;
+#endif
+
+ private:
+ /// Number of 32-bit registers this rematerialization covers.
+ unsigned NumRegs;
+
+ // The three members below are the scoring components, top to bottom from
+ // most important to least important when comparing candidates.
- RematInstruction(MachineInstr *UseMI) : UseMI(UseMI) {}
+ /// Frequency of the impacted target region with the highest known frequency.
+ /// This only matters when the stage is trying to reduce spilling, so it is
+ /// always 0 when it is not.
+ uint64_t MaxFreq;
+ /// Frequency difference between defining and using regions. Negative values
+ /// indicate we are rematerializing to higher-frequency regions; positive
+ /// values indicate the opposite.
+ int64_t FreqDiff;
+ /// Expected number of target regions impacted by the rematerialization,
+ /// scaled by the size of the register being rematerialized.
+ unsigned RegionImpact;
+
+ unsigned getNumRegs(const GCNScheduleDAGMILive &DAG) const;
+
+ int64_t getFreqDiff(const FreqInfo &Freq) const;
};
- /// Maps all MIs to their parent region. MI terminators are considered to be
- /// outside the region they delimitate, and as such are not stored in the map.
- DenseMap<MachineInstr *, unsigned> MIRegion;
/// Parent MBB to each region, in region order.
SmallVector<MachineBasicBlock *> RegionBB;
- /// Collects instructions to rematerialize.
- MapVector<MachineInstr *, RematInstruction> Rematerializations;
- /// Collects regions whose live-ins or register pressure will change due to
- /// rematerializations.
- DenseMap<unsigned, GCNRegPressure> ImpactedRegions;
- /// In case we need to rollback rematerializations, save lane masks for all
- /// rematerialized registers in all regions in which they are live-ins.
- DenseMap<std::pair<unsigned, Register>, LaneBitmask> RegMasks;
- /// After successful stage initialization, indicates which regions should be
- /// rescheduled.
- BitVector RescheduleRegions;
- /// The target occupancy the stage is trying to achieve. Empty when the
+ /// Register pressure targets for all regions.
+ SmallVector<GCNRPTarget> RPTargets;
+ /// Regions which are above the stage's RP target.
+ BitVector TargetRegions;
+ /// The target occupancy the stage is trying to achieve. Empty when the
/// objective is spilling reduction.
std::optional<unsigned> TargetOcc;
/// Achieved occupancy *only* through rematerializations (pre-rescheduling).
- /// Smaller than or equal to the target occupancy.
unsigned AchievedOcc;
+ /// After successful stage initialization, indicates which regions should be
+ /// rescheduled.
+ BitVector RescheduleRegions;
- /// Returns whether remat can reduce spilling or increase function occupancy
- /// by 1 through rematerialization. If it can do one, collects instructions in
- /// PreRARematStage::Rematerializations and sets the target occupancy in
- /// PreRARematStage::TargetOccupancy.
- bool canIncreaseOccupancyOrReduceSpill();
+ /// List of rematerializable registers.
+ SmallVector<RematReg> RematRegs;
+
+ /// Holds enough information to roll back a rematerialization decision post
+ /// re-scheduling.
+ struct RollbackInfo {
+ /// The rematerializable register under consideration.
+ const RematReg *Remat;
+ /// The rematerialized MI replacing the original defining MI.
+ MachineInstr *RematMI;
+ /// Maps register machine operand indices to their original register.
+ SmallDenseMap<unsigned, Register, 4> RegMap;
+
+ RollbackInfo(const RematReg *Remat) : Remat(Remat) {}
+ };
+ /// List of rematerializations to roll back if rematerialization does not end
+ /// up being beneficial.
+ SmallVector<RollbackInfo> Rollbacks;
+
+ /// Per-region state captured post-rematerialization but pre-re-scheduling,
+ /// kept so that re-scheduling effects can be reverted.
+ struct RegionSchedRevert {
+ /// Region number;
+ unsigned RegionIdx;
+ /// Original instruction order (both debug and non-debug MIs).
+ std::vector<MachineInstr *> OrigMIOrder;
+ /// Maximum pressure recorded in the region.
+ GCNRegPressure MaxPressure;
+
+ RegionSchedRevert(unsigned RegionIdx, ArrayRef<MachineInstr *> OrigMIOrder,
+ const GCNRegPressure &MaxPressure)
+ : RegionIdx(RegionIdx), OrigMIOrder(OrigMIOrder),
+ MaxPressure(MaxPressure) {}
+ };
+ /// After re-scheduling, contains pre-re-scheduling data for all re-scheduled
+ /// regions.
+ SmallVector<RegionSchedRevert> RegionReverts;
+
+ /// Returns the occupancy the stage is trying to achieve.
+ unsigned getStageTargetOccupancy() const;
+
+ /// Determines the stage's objective (increasing occupancy or reducing
+ /// spilling, set in \ref TargetOcc). Defines \ref RPTargets in all regions to
+ /// achieve that objective and marks those that don't achieve it in \ref
+ /// TargetRegions. Returns whether there is any target region.
+ bool setObjective();
+
+ /// Unsets target regions in \p Regions whose RP target has been reached.
+ void unsetSatisifedRPTargets(const BitVector &Regions);
+
+ /// Fully recomputes RP from the DAG in \p Regions. Among those regions,
+ /// re-marks as target regions all those that were optimistically considered
+ /// satisfied but are actually not, and returns whether there were any such
+ /// regions.
+ bool updateAndVerifyRPTargets(const BitVector &Regions);
+
+ /// Collects all rematerializable registers and appends them to \ref
+ /// RematRegs. \p MIRegion maps MIs to their region. Returns whether any
+ /// rematerializable register was found.
+ bool collectRematRegs(const DenseMap<MachineInstr *, unsigned> &MIRegion);
+
+ /// Rematerializes \p Remat. This removes the rematerialized register from
+ /// live-in/out lists in the DAG and updates RP targets in all affected
+ /// regions, which are also marked in \ref RescheduleRegions. Regions in which
+ /// RP savings are not guaranteed are set in \p RecomputeRP. When \p Rollback
+ /// is non-null, fills it with required information to be able to rollback the
+ /// rematerialization post-rescheduling.
+ void rematerialize(const RematReg &Remat, BitVector &RecomputeRP,
+ RollbackInfo *Rollback);
+
+ /// Deletes the original defining MIs, which were kept around as DBG_VALUE
+ /// placeholders for potential rollback, once rematerializations are final.
+ void commitRematerializations() const;
/// Whether the MI is rematerializable
bool isReMaterializable(const MachineInstr &MI);
- /// Rematerializes all instructions in PreRARematStage::Rematerializations
- /// and stores the achieved occupancy after remat in
- /// PreRARematStage::AchievedOcc.
- void rematerialize();
-
/// If remat alone did not increase occupancy to the target one, rolls back all
/// rematerializations and resets live-ins/RP in all regions impacted by the
/// stage to their pre-stage values.
@@ -516,10 +755,17 @@ public:
bool initGCNRegion() override;
+ void finalizeGCNRegion() override;
+
bool shouldRevertScheduling(unsigned WavesAfter) override;
PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
- : GCNSchedStage(StageID, DAG), RescheduleRegions(DAG.Regions.size()) {}
+ : GCNSchedStage(StageID, DAG), TargetRegions(DAG.Regions.size()),
+ RescheduleRegions(DAG.Regions.size()) {
+ const unsigned NumRegions = DAG.Regions.size();
+ RPTargets.reserve(NumRegions);
+ RegionBB.reserve(NumRegions);
+ }
};
class ILPInitialScheduleStage : public GCNSchedStage {
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index f291e37..da63628 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -119,15 +119,15 @@ GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
// that do not support ADDR64 variants of MUBUF instructions. Such targets
// cannot use a 64 bit offset with a MUBUF instruction to access the global
// address space
- if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
- ToggleFeature(AMDGPU::FeatureFlatForGlobal);
- FlatForGlobal = true;
+ if (!hasAddr64() && !FS.contains("flat-for-global") && !UseFlatForGlobal) {
+ ToggleFeature(AMDGPU::FeatureUseFlatForGlobal);
+ UseFlatForGlobal = true;
}
// Unless +-flat-for-global is specified, use MUBUF instructions for global
// address space access if flat operations are not available.
- if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
- ToggleFeature(AMDGPU::FeatureFlatForGlobal);
- FlatForGlobal = false;
+ if (!hasFlat() && !FS.contains("flat-for-global") && UseFlatForGlobal) {
+ ToggleFeature(AMDGPU::FeatureUseFlatForGlobal);
+ UseFlatForGlobal = false;
}
// Set defaults if needed.
@@ -169,7 +169,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
: // clang-format off
AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
AMDGPUSubtarget(TT),
- TargetTriple(TT),
TargetID(*this),
InstrItins(getInstrItineraryForCPU(GPU)),
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
@@ -645,6 +644,8 @@ void GCNSubtarget::adjustSchedDependency(
MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
unsigned Lat = 0;
for (++I; I != E && I->isBundledWithPred(); ++I) {
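+ // Meta instructions do not execute and must not contribute to latency.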
+ if (I->isMetaInstruction())
+ continue;
if (I->modifiesRegister(Reg, TRI))
Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
else if (Lat)
@@ -658,6 +659,8 @@ void GCNSubtarget::adjustSchedDependency(
MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
+ if (I->isMetaInstruction())
+ continue;
if (I->readsRegister(Reg, TRI))
break;
--Lat;
@@ -699,7 +702,7 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
KernargSegmentPtr = true;
bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
- if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
+ if (IsAmdHsaOrMesa && !ST.hasFlatScratchEnabled())
PrivateSegmentBuffer = true;
else if (ST.isMesaGfxShader(F))
ImplicitBufferPtr = true;
@@ -717,13 +720,13 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
}
if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
- (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
- // FlatScratchInit cannot be true for graphics CC if enableFlatScratch()
- // is false.
- (ST.enableFlatScratch() ||
+ (IsAmdHsaOrMesa || ST.hasFlatScratchEnabled()) &&
+ // FlatScratchInit cannot be true for graphics CC if
+ // hasFlatScratchEnabled() is false.
+ (ST.hasFlatScratchEnabled() ||
(!AMDGPU::isGraphics(CC) &&
!F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) &&
- !ST.flatScratchIsArchitected()) {
+ !ST.hasArchitectedFlatScratch()) {
FlatScratchInit = true;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index c2e6078..b308e0d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -60,238 +60,25 @@ private:
protected:
// Basic subtarget description.
- Triple TargetTriple;
AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
unsigned Gen = INVALID;
InstrItineraryData InstrItins;
int LDSBankCount = 0;
unsigned MaxPrivateElementSize = 0;
- // Possibly statically set by tablegen, but may want to be overridden.
- bool FastDenormalF32 = false;
- bool HalfRate64Ops = false;
- bool FullRate64Ops = false;
-
// Dynamically set bits that enable features.
- bool FlatForGlobal = false;
- bool AutoWaitcntBeforeBarrier = false;
- bool BackOffBarrier = false;
- bool UnalignedScratchAccess = false;
- bool UnalignedAccessMode = false;
- bool RelaxedBufferOOBMode = false;
- bool HasApertureRegs = false;
- bool SupportsXNACK = false;
- bool KernargPreload = false;
-
- // This should not be used directly. 'TargetID' tracks the dynamic settings
- // for XNACK.
- bool EnableXNACK = false;
-
- bool EnableTgSplit = false;
- bool EnableCuMode = false;
- bool TrapHandler = false;
- bool EnablePreciseMemory = false;
-
- // Used as options.
- bool EnableLoadStoreOpt = false;
- bool EnableUnsafeDSOffsetFolding = false;
- bool EnableSIScheduler = false;
- bool EnableDS128 = false;
- bool EnablePRTStrictNull = false;
- bool DumpCode = false;
- bool AssemblerPermissiveWavesize = false;
-
- // Subtarget statically properties set by tablegen
- bool FP64 = false;
- bool FMA = false;
- bool MIMG_R128 = false;
- bool CIInsts = false;
- bool GFX8Insts = false;
- bool GFX9Insts = false;
- bool GFX90AInsts = false;
- bool GFX940Insts = false;
- bool GFX950Insts = false;
- bool GFX10Insts = false;
- bool GFX11Insts = false;
- bool GFX12Insts = false;
- bool GFX1250Insts = false;
- bool GFX10_3Insts = false;
- bool GFX7GFX8GFX9Insts = false;
- bool SGPRInitBug = false;
- bool UserSGPRInit16Bug = false;
- bool NegativeScratchOffsetBug = false;
- bool NegativeUnalignedScratchOffsetBug = false;
- bool HasSMemRealTime = false;
- bool HasIntClamp = false;
- bool HasFmaMixInsts = false;
- bool HasFmaMixBF16Insts = false;
- bool HasMovrel = false;
- bool HasVGPRIndexMode = false;
- bool HasScalarDwordx3Loads = false;
- bool HasScalarStores = false;
- bool HasScalarAtomics = false;
- bool HasSDWAOmod = false;
- bool HasSDWAScalar = false;
- bool HasSDWASdst = false;
- bool HasSDWAMac = false;
- bool HasSDWAOutModsVOPC = false;
- bool HasDPP = false;
- bool HasDPP8 = false;
- bool HasDPALU_DPP = false;
- bool HasDPPSrc1SGPR = false;
- bool HasPackedFP32Ops = false;
- bool HasImageInsts = false;
- bool HasExtendedImageInsts = false;
- bool HasR128A16 = false;
- bool HasA16 = false;
- bool HasG16 = false;
- bool HasNSAEncoding = false;
- bool HasPartialNSAEncoding = false;
- bool GFX10_AEncoding = false;
- bool GFX10_BEncoding = false;
- bool HasDLInsts = false;
- bool HasFmacF64Inst = false;
- bool HasDot1Insts = false;
- bool HasDot2Insts = false;
- bool HasDot3Insts = false;
- bool HasDot4Insts = false;
- bool HasDot5Insts = false;
- bool HasDot6Insts = false;
- bool HasDot7Insts = false;
- bool HasDot8Insts = false;
- bool HasDot9Insts = false;
- bool HasDot10Insts = false;
- bool HasDot11Insts = false;
- bool HasDot12Insts = false;
- bool HasDot13Insts = false;
- bool HasMAIInsts = false;
- bool HasFP8Insts = false;
- bool HasFP8ConversionInsts = false;
- bool HasFP8E5M3Insts = false;
- bool HasCvtFP8Vop1Bug = false;
- bool HasPkFmacF16Inst = false;
- bool HasAtomicFMinFMaxF32GlobalInsts = false;
- bool HasAtomicFMinFMaxF64GlobalInsts = false;
- bool HasAtomicFMinFMaxF32FlatInsts = false;
- bool HasAtomicFMinFMaxF64FlatInsts = false;
- bool HasAtomicDsPkAdd16Insts = false;
- bool HasAtomicFlatPkAdd16Insts = false;
- bool HasAtomicFaddRtnInsts = false;
- bool HasAtomicFaddNoRtnInsts = false;
- bool HasMemoryAtomicFaddF32DenormalSupport = false;
- bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
- bool HasAtomicBufferGlobalPkAddF16Insts = false;
- bool HasAtomicCSubNoRtnInsts = false;
- bool HasAtomicGlobalPkAddBF16Inst = false;
- bool HasAtomicBufferPkAddBF16Inst = false;
- bool HasFlatAtomicFaddF32Inst = false;
- bool HasFlatBufferGlobalAtomicFaddF64Inst = false;
- bool HasDefaultComponentZero = false;
- bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
- bool HasEmulatedSystemScopeAtomics = false;
- bool HasDefaultComponentBroadcast = false;
- bool HasXF32Insts = false;
+ bool DynamicVGPR = false;
+ bool DynamicVGPRBlockSize32 = false;
+ bool ScalarizeGlobal = false;
+
/// The maximum number of instructions that may be placed within an S_CLAUSE,
/// which is one greater than the maximum argument to S_CLAUSE. A value of 0
/// indicates a lack of S_CLAUSE support.
unsigned MaxHardClauseLength = 0;
- bool SupportsSRAMECC = false;
- bool DynamicVGPR = false;
- bool DynamicVGPRBlockSize32 = false;
- bool HasVMemToLDSLoad = false;
- bool RequiresAlignVGPR = false;
-
- // This should not be used directly. 'TargetID' tracks the dynamic settings
- // for SRAMECC.
- bool EnableSRAMECC = false;
-
- bool HasNoSdstCMPX = false;
- bool HasVscnt = false;
- bool HasWaitXcnt = false;
- bool HasGetWaveIdInst = false;
- bool HasSMemTimeInst = false;
- bool HasShaderCyclesRegister = false;
- bool HasShaderCyclesHiLoRegisters = false;
- bool HasVOP3Literal = false;
- bool HasNoDataDepHazard = false;
- bool FlatAddressSpace = false;
- bool FlatInstOffsets = false;
- bool FlatGlobalInsts = false;
- bool FlatScratchInsts = false;
- bool FlatGVSMode = false;
- bool ScalarFlatScratchInsts = false;
- bool HasArchitectedFlatScratch = false;
- bool EnableFlatScratch = false;
- bool HasArchitectedSGPRs = false;
- bool HasGDS = false;
- bool HasGWS = false;
- bool AddNoCarryInsts = false;
- bool HasUnpackedD16VMem = false;
- bool LDSMisalignedBug = false;
- bool HasMFMAInlineLiteralBug = false;
- bool UnalignedBufferAccess = false;
- bool UnalignedDSAccess = false;
- bool HasPackedTID = false;
- bool ScalarizeGlobal = false;
- bool HasSALUFloatInsts = false;
- bool HasPseudoScalarTrans = false;
- bool HasRestrictedSOffset = false;
- bool Has64BitLiterals = false;
- bool Has1024AddressableVGPRs = false;
- bool HasBitOp3Insts = false;
- bool HasTanhInsts = false;
- bool HasTensorCvtLutInsts = false;
- bool HasTransposeLoadF4F6Insts = false;
- bool HasPrngInst = false;
- bool HasBVHDualAndBVH8Insts = false;
- bool HasPermlane16Swap = false;
- bool HasPermlane32Swap = false;
- bool HasVcmpxPermlaneHazard = false;
- bool HasVMEMtoScalarWriteHazard = false;
- bool HasSMEMtoVectorWriteHazard = false;
- bool HasInstFwdPrefetchBug = false;
- bool HasVmemPrefInsts = false;
- bool HasSafeSmemPrefetch = false;
- bool HasSafeCUPrefetch = false;
- bool HasVcmpxExecWARHazard = false;
- bool HasLdsBranchVmemWARHazard = false;
- bool HasNSAtoVMEMBug = false;
- bool HasNSAClauseBug = false;
- bool HasOffset3fBug = false;
- bool HasFlatSegmentOffsetBug = false;
- bool HasImageStoreD16Bug = false;
- bool HasImageGather4D16Bug = false;
- bool HasMSAALoadDstSelBug = false;
- bool HasPrivEnabledTrap2NopBug = false;
- bool Has1_5xVGPRs = false;
- bool HasMADIntraFwdBug = false;
- bool HasVOPDInsts = false;
- bool HasVALUTransUseHazard = false;
- bool HasRequiredExportPriority = false;
- bool HasVmemWriteVgprInOrder = false;
- bool HasAshrPkInsts = false;
- bool HasIEEEMinimumMaximumInsts = false;
- bool HasMinimum3Maximum3F32 = false;
- bool HasMinimum3Maximum3F16 = false;
- bool HasMin3Max3PKF16 = false;
- bool HasMinimum3Maximum3PKF16 = false;
- bool HasLshlAddU64Inst = false;
- bool HasAddSubU64Insts = false;
- bool HasMadU32Inst = false;
- bool HasPointSampleAccel = false;
- bool HasLdsBarrierArriveAtomic = false;
- bool HasSetPrioIncWgInst = false;
-
- bool RequiresCOV6 = false;
- bool UseBlockVGPROpsForCSR = false;
- bool HasGloballyAddressableScratch = false;
-
- bool Has45BitNumRecordsBufferResource = false;
-
- bool HasClusters = false;
-
- // Dummy feature to use for assembler in tablegen.
- bool FeatureDisable = false;
+
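+// The generated .inc expands GET_SUBTARGETINFO_MACRO once per tablegen
+// subtarget feature, e.g. (illustrative):
+//   GET_SUBTARGETINFO_MACRO(HasGFX11Insts, false, hasGFX11Insts)
+// so this declares one default-initialized boolean member per feature.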
+#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
+ bool ATTRIBUTE = DEFAULT;
+#include "AMDGPUGenSubtargetInfo.inc"
private:
SIInstrInfo InstrInfo;
@@ -303,24 +90,20 @@ public:
const GCNTargetMachine &TM);
~GCNSubtarget() override;
- GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
- StringRef GPU, StringRef FS);
+ GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU,
+ StringRef FS);
/// Diagnose inconsistent subtarget features before attempting to codegen
/// function \p F.
void checkSubtargetFeatures(const Function &F) const;
- const SIInstrInfo *getInstrInfo() const override {
- return &InstrInfo;
- }
+ const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; }
const SIFrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
- const SITargetLowering *getTargetLowering() const override {
- return &TLInfo;
- }
+ const SITargetLowering *getTargetLowering() const override { return &TLInfo; }
const SIRegisterInfo *getRegisterInfo() const override {
return &InstrInfo.getRegisterInfo();
@@ -358,9 +141,13 @@ public:
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
- Generation getGeneration() const {
- return (Generation)Gen;
- }
+ Generation getGeneration() const { return (Generation)Gen; }
+
+ bool isGFX11Plus() const { return getGeneration() >= GFX11; }
+
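+// Likewise generates one trivial getter per tablegen feature, e.g.
+// (illustrative):
+//   bool hasGFX11Insts() const override { return HasGFX11Insts; }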
+#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
+ bool GETTER() const override { return ATTRIBUTE; }
+#include "AMDGPUGenSubtargetInfo.inc"
unsigned getMaxWaveScratchSize() const {
// See COMPUTE_TMPRING_SIZE.WAVESIZE.
@@ -381,12 +168,11 @@ public:
return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
}
- int getLDSBankCount() const {
- return LDSBankCount;
- }
+ int getLDSBankCount() const { return LDSBankCount; }
unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
- return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
+ return (ForBufferRSrc || !hasFlatScratchEnabled()) ? MaxPrivateElementSize
+ : 16;
}
unsigned getConstantBusLimit(unsigned Opcode) const;
@@ -397,34 +183,12 @@ public:
bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
bool supportsWGP() const {
- if (GFX1250Insts)
+ if (HasGFX1250Insts)
return false;
return getGeneration() >= GFX10;
}
- bool hasIntClamp() const {
- return HasIntClamp;
- }
-
- bool hasFP64() const {
- return FP64;
- }
-
- bool hasMIMG_R128() const {
- return MIMG_R128;
- }
-
- bool hasHWFP64() const {
- return FP64;
- }
-
- bool hasHalfRate64Ops() const {
- return HalfRate64Ops;
- }
-
- bool hasFullRate64Ops() const {
- return FullRate64Ops;
- }
+ bool hasHWFP64() const { return HasFP64; }
bool hasAddr64() const {
return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
@@ -440,67 +204,19 @@ public:
return getGeneration() >= VOLCANIC_ISLANDS;
}
- bool hasFractBug() const {
- return getGeneration() == SOUTHERN_ISLANDS;
- }
-
- bool hasBFE() const {
- return true;
- }
-
- bool hasBFI() const {
- return true;
- }
-
- bool hasBFM() const {
- return hasBFE();
- }
-
- bool hasBCNT(unsigned Size) const {
- return true;
- }
+ bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; }
- bool hasFFBL() const {
- return true;
- }
-
- bool hasFFBH() const {
- return true;
- }
-
- bool hasMed3_16() const {
- return getGeneration() >= AMDGPUSubtarget::GFX9;
- }
+ bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; }
bool hasMin3Max3_16() const {
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
- bool hasFmaMixInsts() const {
- return HasFmaMixInsts;
- }
+ bool hasSwap() const { return HasGFX9Insts; }
- bool hasFmaMixBF16Insts() const { return HasFmaMixBF16Insts; }
+ bool hasScalarPackInsts() const { return HasGFX9Insts; }
- bool hasCARRY() const {
- return true;
- }
-
- bool hasFMA() const {
- return FMA;
- }
-
- bool hasSwap() const {
- return GFX9Insts;
- }
-
- bool hasScalarPackInsts() const {
- return GFX9Insts;
- }
-
- bool hasScalarMulHiInsts() const {
- return GFX9Insts;
- }
+ bool hasScalarMulHiInsts() const { return HasGFX9Insts; }
bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
@@ -515,9 +231,7 @@ public:
/// True if the offset field of DS instructions works as expected. On SI, the
/// offset uses a 16-bit adder and does not always wrap properly.
- bool hasUsableDSOffset() const {
- return getGeneration() >= SEA_ISLANDS;
- }
+ bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; }
bool unsafeDSOffsetFoldingEnabled() const {
return EnableUnsafeDSOffsetFolding;
@@ -530,14 +244,10 @@ public:
/// Extra wait hazard is needed in some cases before
/// s_cbranch_vccnz/s_cbranch_vccz.
- bool hasReadVCCZBug() const {
- return getGeneration() <= SEA_ISLANDS;
- }
+ bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; }
/// Writes to VCC_LO/VCC_HI update the VCCZ flag.
- bool partialVCCWritesUpdateVCCZ() const {
- return getGeneration() >= GFX10;
- }
+ bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; }
/// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
/// was written by a VALU instruction.
@@ -551,19 +261,13 @@ public:
return getGeneration() >= VOLCANIC_ISLANDS;
}
- bool hasRFEHazards() const {
- return getGeneration() >= VOLCANIC_ISLANDS;
- }
+ bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; }
/// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
unsigned getSetRegWaitStates() const {
return getGeneration() <= SEA_ISLANDS ? 1 : 2;
}
- bool dumpCode() const {
- return DumpCode;
- }
-
/// Return the amount of LDS that can be used that will not restrict the
/// occupancy lower than WaveCount.
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
@@ -578,25 +282,15 @@ public:
return getGeneration() >= AMDGPUSubtarget::GFX10;
}
- bool useFlatForGlobal() const {
- return FlatForGlobal;
- }
-
/// \returns If target supports ds_read/write_b128 and user enables generation
/// of ds_read/write_b128.
- bool useDS128() const {
- return CIInsts && EnableDS128;
- }
+ bool useDS128() const { return HasCIInsts && EnableDS128; }
/// \return If target supports ds_read/write_b96/128.
- bool hasDS96AndDS128() const {
- return CIInsts;
- }
+ bool hasDS96AndDS128() const { return HasCIInsts; }
/// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
- bool haveRoundOpsF64() const {
- return CIInsts;
- }
+ bool haveRoundOpsF64() const { return HasCIInsts; }
/// \returns If MUBUF instructions always perform range checking, even for
/// buffer resources used for private memory access.
@@ -606,91 +300,29 @@ public:
/// \returns If target requires PRT Struct NULL support (zero result registers
/// for sparse texture support).
- bool usePRTStrictNull() const {
- return EnablePRTStrictNull;
- }
-
- bool hasAutoWaitcntBeforeBarrier() const {
- return AutoWaitcntBeforeBarrier;
- }
-
- /// \returns true if the target supports backing off of s_barrier instructions
- /// when an exception is raised.
- bool supportsBackOffBarrier() const {
- return BackOffBarrier;
- }
-
- bool hasUnalignedBufferAccess() const {
- return UnalignedBufferAccess;
- }
+ bool usePRTStrictNull() const { return EnablePRTStrictNull; }
bool hasUnalignedBufferAccessEnabled() const {
- return UnalignedBufferAccess && UnalignedAccessMode;
- }
-
- bool hasUnalignedDSAccess() const {
- return UnalignedDSAccess;
+ return HasUnalignedBufferAccess && HasUnalignedAccessMode;
}
bool hasUnalignedDSAccessEnabled() const {
- return UnalignedDSAccess && UnalignedAccessMode;
- }
-
- bool hasUnalignedScratchAccess() const {
- return UnalignedScratchAccess;
+ return HasUnalignedDSAccess && HasUnalignedAccessMode;
}
bool hasUnalignedScratchAccessEnabled() const {
- return UnalignedScratchAccess && UnalignedAccessMode;
+ return HasUnalignedScratchAccess && HasUnalignedAccessMode;
}
- bool hasUnalignedAccessMode() const {
- return UnalignedAccessMode;
- }
-
- bool hasRelaxedBufferOOBMode() const { return RelaxedBufferOOBMode; }
-
- bool hasApertureRegs() const {
- return HasApertureRegs;
- }
+ bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }
- bool isTrapHandlerEnabled() const {
- return TrapHandler;
- }
+ bool isTgSplitEnabled() const { return EnableTgSplit; }
- bool isXNACKEnabled() const {
- return TargetID.isXnackOnOrAny();
- }
-
- bool isTgSplitEnabled() const {
- return EnableTgSplit;
- }
-
- bool isCuModeEnabled() const {
- return EnableCuMode;
- }
+ bool isCuModeEnabled() const { return EnableCuMode; }
bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
- bool hasFlatAddressSpace() const {
- return FlatAddressSpace;
- }
-
- bool hasFlatScrRegister() const {
- return hasFlatAddressSpace();
- }
-
- bool hasFlatInstOffsets() const {
- return FlatInstOffsets;
- }
-
- bool hasFlatGlobalInsts() const {
- return FlatGlobalInsts;
- }
-
- bool hasFlatScratchInsts() const {
- return FlatScratchInsts;
- }
+ bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }
// Check if target supports ST addressing mode with FLAT scratch instructions.
// The ST addressing mode means no registers are used, either VGPR or SGPR,
@@ -699,24 +331,16 @@ public:
return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
}
- bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
+ bool hasFlatScratchSVSMode() const { return HasGFX940Insts || HasGFX11Insts; }
- bool hasScalarFlatScratchInsts() const {
- return ScalarFlatScratchInsts;
- }
-
- bool enableFlatScratch() const {
- return flatScratchIsArchitected() ||
+ bool hasFlatScratchEnabled() const {
+ return hasArchitectedFlatScratch() ||
(EnableFlatScratch && hasFlatScratchInsts());
}
- bool hasGlobalAddTidInsts() const {
- return GFX10_BEncoding;
- }
+ bool hasGlobalAddTidInsts() const { return HasGFX10_BEncoding; }
- bool hasAtomicCSub() const {
- return GFX10_BEncoding;
- }
+ bool hasAtomicCSub() const { return HasGFX10_BEncoding; }
bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }
@@ -726,7 +350,9 @@ public:
return !hasGFX940Insts() && !hasGFX1250Insts();
}
- bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); }
+ bool hasVINTERPEncoding() const {
+ return HasGFX11Insts && !hasGFX1250Insts();
+ }
// DS_ADD_F64/DS_ADD_RTN_F64
bool hasLdsAtomicAddF64() const {
@@ -737,274 +363,45 @@ public:
return getGeneration() >= GFX9;
}
- bool hasFlatSegmentOffsetBug() const {
- return HasFlatSegmentOffsetBug;
- }
+ bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }
- bool hasFlatLgkmVMemCountInOrder() const {
- return getGeneration() > GFX9;
- }
-
- bool hasD16LoadStore() const {
- return getGeneration() >= GFX9;
- }
+ bool hasD16LoadStore() const { return getGeneration() >= GFX9; }
bool d16PreservesUnusedBits() const {
return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
}
- bool hasD16Images() const {
- return getGeneration() >= VOLCANIC_ISLANDS;
- }
+ bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }
/// Return if most LDS instructions have an m0 use that require m0 to be
/// initialized.
- bool ldsRequiresM0Init() const {
- return getGeneration() < GFX9;
- }
+ bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }
// True if the hardware rewinds and replays GWS operations if a wave is
// preempted.
//
// If this is false, a GWS operation requires testing if a nack set the
// MEM_VIOL bit, and repeating if so.
- bool hasGWSAutoReplay() const {
- return getGeneration() >= GFX9;
- }
+ bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }
/// \returns if target has ds_gws_sema_release_all instruction.
- bool hasGWSSemaReleaseAll() const {
- return CIInsts;
- }
-
- /// \returns true if the target has integer add/sub instructions that do not
- /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
- /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
- /// for saturation.
- bool hasAddNoCarry() const {
- return AddNoCarryInsts;
- }
+ bool hasGWSSemaReleaseAll() const { return HasCIInsts; }
bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
- bool hasUnpackedD16VMem() const {
- return HasUnpackedD16VMem;
- }
-
// Covers VS/PS/CS graphics shaders
bool isMesaGfxShader(const Function &F) const {
return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
}
- bool hasMad64_32() const {
- return getGeneration() >= SEA_ISLANDS;
- }
-
- bool hasSDWAOmod() const {
- return HasSDWAOmod;
- }
-
- bool hasSDWAScalar() const {
- return HasSDWAScalar;
- }
-
- bool hasSDWASdst() const {
- return HasSDWASdst;
- }
-
- bool hasSDWAMac() const {
- return HasSDWAMac;
- }
-
- bool hasSDWAOutModsVOPC() const {
- return HasSDWAOutModsVOPC;
- }
-
- bool hasDLInsts() const {
- return HasDLInsts;
- }
-
- bool hasFmacF64Inst() const { return HasFmacF64Inst; }
-
- bool hasDot1Insts() const {
- return HasDot1Insts;
- }
-
- bool hasDot2Insts() const {
- return HasDot2Insts;
- }
-
- bool hasDot3Insts() const {
- return HasDot3Insts;
- }
-
- bool hasDot4Insts() const {
- return HasDot4Insts;
- }
-
- bool hasDot5Insts() const {
- return HasDot5Insts;
- }
-
- bool hasDot6Insts() const {
- return HasDot6Insts;
- }
-
- bool hasDot7Insts() const {
- return HasDot7Insts;
- }
-
- bool hasDot8Insts() const {
- return HasDot8Insts;
- }
-
- bool hasDot9Insts() const {
- return HasDot9Insts;
- }
-
- bool hasDot10Insts() const {
- return HasDot10Insts;
- }
-
- bool hasDot11Insts() const {
- return HasDot11Insts;
- }
-
- bool hasDot12Insts() const {
- return HasDot12Insts;
- }
-
- bool hasDot13Insts() const {
- return HasDot13Insts;
- }
-
- bool hasMAIInsts() const {
- return HasMAIInsts;
- }
-
- bool hasFP8Insts() const {
- return HasFP8Insts;
- }
-
- bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
-
- bool hasFP8E5M3Insts() const { return HasFP8E5M3Insts; }
-
- bool hasPkFmacF16Inst() const {
- return HasPkFmacF16Inst;
- }
-
- bool hasAtomicFMinFMaxF32GlobalInsts() const {
- return HasAtomicFMinFMaxF32GlobalInsts;
- }
-
- bool hasAtomicFMinFMaxF64GlobalInsts() const {
- return HasAtomicFMinFMaxF64GlobalInsts;
- }
-
- bool hasAtomicFMinFMaxF32FlatInsts() const {
- return HasAtomicFMinFMaxF32FlatInsts;
- }
-
- bool hasAtomicFMinFMaxF64FlatInsts() const {
- return HasAtomicFMinFMaxF64FlatInsts;
- }
-
- bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
-
- bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
+ bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
bool hasAtomicFaddInsts() const {
return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
}
- bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
-
- bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
-
- bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
- return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
- }
-
- bool hasAtomicBufferGlobalPkAddF16Insts() const {
- return HasAtomicBufferGlobalPkAddF16Insts;
- }
-
- bool hasAtomicGlobalPkAddBF16Inst() const {
- return HasAtomicGlobalPkAddBF16Inst;
- }
-
- bool hasAtomicBufferPkAddBF16Inst() const {
- return HasAtomicBufferPkAddBF16Inst;
- }
-
- bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
-
- /// \return true if the target has flat, global, and buffer atomic fadd for
- /// double.
- bool hasFlatBufferGlobalAtomicFaddF64Inst() const {
- return HasFlatBufferGlobalAtomicFaddF64Inst;
- }
-
- /// \return true if the target's flat, global, and buffer atomic fadd for
- /// float supports denormal handling.
- bool hasMemoryAtomicFaddF32DenormalSupport() const {
- return HasMemoryAtomicFaddF32DenormalSupport;
- }
-
- /// \return true if atomic operations targeting fine-grained memory work
- /// correctly at device scope, in allocations in host or peer PCIe device
- /// memory.
- bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const {
- return HasAgentScopeFineGrainedRemoteMemoryAtomics;
- }
-
- /// \return true is HW emulates system scope atomics unsupported by the PCI-e
- /// via CAS loop.
- bool hasEmulatedSystemScopeAtomics() const {
- return HasEmulatedSystemScopeAtomics;
- }
-
- bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
-
- bool hasDefaultComponentBroadcast() const {
- return HasDefaultComponentBroadcast;
- }
-
- bool hasNoSdstCMPX() const {
- return HasNoSdstCMPX;
- }
-
- bool hasVscnt() const {
- return HasVscnt;
- }
-
- bool hasGetWaveIdInst() const {
- return HasGetWaveIdInst;
- }
-
- bool hasSMemTimeInst() const {
- return HasSMemTimeInst;
- }
-
- bool hasShaderCyclesRegister() const {
- return HasShaderCyclesRegister;
- }
-
- bool hasShaderCyclesHiLoRegisters() const {
- return HasShaderCyclesHiLoRegisters;
- }
-
- bool hasVOP3Literal() const {
- return HasVOP3Literal;
- }
-
- bool hasNoDataDepHazard() const {
- return HasNoDataDepHazard;
- }
-
bool vmemWriteNeedsExpWaitcnt() const {
return getGeneration() < SEA_ISLANDS;
}
@@ -1013,13 +410,7 @@ public:
return getGeneration() == GFX10 || getGeneration() == GFX11;
}
- bool hasPrefetch() const { return GFX12Insts; }
-
- bool hasVmemPrefInsts() const { return HasVmemPrefInsts; }
-
- bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
-
- bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
+ bool hasPrefetch() const { return HasGFX12Insts; }
// Has s_cmpk_* instructions.
bool hasSCmpK() const { return getGeneration() < GFX12; }
@@ -1034,15 +425,11 @@ public:
// dynamic realignment in common cases.
Align getStackAlignment() const { return Align(16); }
- bool enableMachineScheduler() const override {
- return true;
- }
+ bool enableMachineScheduler() const override { return true; }
bool useAA() const override;
- bool enableSubRegLiveness() const override {
- return true;
- }
+ bool enableSubRegLiveness() const override { return true; }
void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
@@ -1051,9 +438,7 @@ public:
static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
// XXX - Why is this here if it isn't in the default pass set?
- bool enableEarlyIfConversion() const override {
- return true;
- }
+ bool enableEarlyIfConversion() const override { return true; }
void overrideSchedPolicy(MachineSchedPolicy &Policy,
const SchedRegion &Region) const override;
@@ -1067,73 +452,35 @@ public:
return AMDGPU::getMaxNumUserSGPRs(*this);
}
- bool hasSMemRealTime() const {
- return HasSMemRealTime;
- }
-
- bool hasMovrel() const {
- return HasMovrel;
- }
-
- bool hasVGPRIndexMode() const {
- return HasVGPRIndexMode;
- }
-
bool useVGPRIndexMode() const;
bool hasScalarCompareEq64() const {
return getGeneration() >= VOLCANIC_ISLANDS;
}
- bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
-
- bool hasScalarStores() const {
- return HasScalarStores;
- }
-
- bool hasScalarAtomics() const {
- return HasScalarAtomics;
+ bool hasLDSFPAtomicAddF32() const { return HasGFX8Insts; }
+ bool hasLDSFPAtomicAddF64() const {
+ return HasGFX90AInsts || HasGFX1250Insts;
}
- bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
- bool hasLDSFPAtomicAddF64() const { return GFX90AInsts || GFX1250Insts; }
-
/// \returns true if the subtarget has the v_permlanex16_b32 instruction.
bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
/// \returns true if the subtarget has the v_permlane64_b32 instruction.
bool hasPermLane64() const { return getGeneration() >= GFX11; }
- bool hasDPP() const {
- return HasDPP;
- }
-
- bool hasDPPBroadcasts() const {
- return HasDPP && getGeneration() < GFX10;
- }
+ bool hasDPPBroadcasts() const { return HasDPP && getGeneration() < GFX10; }
bool hasDPPWavefrontShifts() const {
return HasDPP && getGeneration() < GFX10;
}
- bool hasDPP8() const {
- return HasDPP8;
- }
-
- bool hasDPALU_DPP() const {
- return HasDPALU_DPP;
- }
-
- bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
-
- bool hasPackedFP32Ops() const {
- return HasPackedFP32Ops;
+ bool hasDPPRowShare() const {
+ return HasDPP && (HasGFX90AInsts || getGeneration() >= GFX10);
}
// Has V_PK_MOV_B32 opcode
- bool hasPkMovB32() const {
- return GFX90AInsts;
- }
+ bool hasPkMovB32() const { return HasGFX90AInsts; }
bool hasFmaakFmamkF32Insts() const {
return getGeneration() >= GFX10 || hasGFX940Insts();
@@ -1141,96 +488,26 @@ public:
bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }
- bool hasImageInsts() const {
- return HasImageInsts;
- }
-
- bool hasExtendedImageInsts() const {
- return HasExtendedImageInsts;
- }
-
- bool hasR128A16() const {
- return HasR128A16;
- }
-
- bool hasA16() const { return HasA16; }
-
- bool hasG16() const { return HasG16; }
-
- bool hasOffset3fBug() const {
- return HasOffset3fBug;
- }
-
- bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
-
- bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
-
- bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
-
- bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
-
- bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
-
- bool hasNSAEncoding() const { return HasNSAEncoding; }
-
bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
- bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
-
unsigned getNSAMaxSize(bool HasSampler = false) const {
return AMDGPU::getNSAMaxSize(*this, HasSampler);
}
- bool hasGFX10_AEncoding() const {
- return GFX10_AEncoding;
- }
-
- bool hasGFX10_BEncoding() const {
- return GFX10_BEncoding;
- }
-
- bool hasGFX10_3Insts() const {
- return GFX10_3Insts;
- }
-
bool hasMadF16() const;
- bool hasMovB64() const { return GFX940Insts || GFX1250Insts; }
-
- bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; }
+ bool hasMovB64() const { return HasGFX940Insts || HasGFX1250Insts; }
// Scalar and global loads support scale_offset bit.
- bool hasScaleOffset() const { return GFX1250Insts; }
-
- bool hasFlatGVSMode() const { return FlatGVSMode; }
+ bool hasScaleOffset() const { return HasGFX1250Insts; }
// FLAT GLOBAL VOffset is signed
- bool hasSignedGVSOffset() const { return GFX1250Insts; }
+ bool hasSignedGVSOffset() const { return HasGFX1250Insts; }
- bool enableSIScheduler() const {
- return EnableSIScheduler;
- }
+ bool loadStoreOptEnabled() const { return EnableLoadStoreOpt; }
- bool loadStoreOptEnabled() const {
- return EnableLoadStoreOpt;
- }
-
- bool hasSGPRInitBug() const {
- return SGPRInitBug;
- }
-
- bool hasUserSGPRInit16Bug() const {
- return UserSGPRInit16Bug && isWave32();
- }
-
- bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
-
- bool hasNegativeUnalignedScratchOffsetBug() const {
- return NegativeUnalignedScratchOffsetBug;
- }
-
- bool hasMFMAInlineLiteralBug() const {
- return HasMFMAInlineLiteralBug;
+ bool hasUserSGPRInit16BugInWave32() const {
+ return HasUserSGPRInit16Bug && isWave32();
}
bool has12DWordStoreHazard() const {
@@ -1238,9 +515,7 @@ public:
}
// \returns true if the subtarget supports DWORDX3 load/store instructions.
- bool hasDwordx3LoadStores() const {
- return CIInsts;
- }
+ bool hasDwordx3LoadStores() const { return HasCIInsts; }
bool hasReadM0MovRelInterpHazard() const {
return getGeneration() == AMDGPUSubtarget::GFX9;
@@ -1259,66 +534,32 @@ public:
return getGeneration() == AMDGPUSubtarget::GFX9;
}
- bool hasVcmpxPermlaneHazard() const {
- return HasVcmpxPermlaneHazard;
- }
-
- bool hasVMEMtoScalarWriteHazard() const {
- return HasVMEMtoScalarWriteHazard;
- }
-
- bool hasSMEMtoVectorWriteHazard() const {
- return HasSMEMtoVectorWriteHazard;
- }
-
- bool hasLDSMisalignedBug() const {
- return LDSMisalignedBug && !EnableCuMode;
- }
-
- bool hasInstFwdPrefetchBug() const {
- return HasInstFwdPrefetchBug;
- }
-
- bool hasVcmpxExecWARHazard() const {
- return HasVcmpxExecWARHazard;
- }
-
- bool hasLdsBranchVmemWARHazard() const {
- return HasLdsBranchVmemWARHazard;
+ bool hasLDSMisalignedBugInWGPMode() const {
+ return HasLDSMisalignedBug && !EnableCuMode;
}
  // Shift amount of a 64-bit shift cannot be the highest allocated register
  // if it is also at the end of the allocation block.
bool hasShift64HighRegBug() const {
- return GFX90AInsts && !GFX940Insts;
+ return HasGFX90AInsts && !HasGFX940Insts;
}
  // Has a one cycle hazard on a transcendental instruction feeding a
  // non-transcendental VALU.
- bool hasTransForwardingHazard() const { return GFX940Insts; }
+ bool hasTransForwardingHazard() const { return HasGFX940Insts; }
// Has one cycle hazard on a VALU instruction partially writing dst with
// a shift of result bits feeding another VALU instruction.
- bool hasDstSelForwardingHazard() const { return GFX940Insts; }
+ bool hasDstSelForwardingHazard() const { return HasGFX940Insts; }
// Cannot use op_sel with v_dot instructions.
- bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
+ bool hasDOTOpSelHazard() const { return HasGFX940Insts || HasGFX11Insts; }
  // Does not have HW interlocks for VALU writing and then reading SGPRs.
- bool hasVDecCoExecHazard() const {
- return GFX940Insts;
- }
-
- bool hasNSAtoVMEMBug() const {
- return HasNSAtoVMEMBug;
- }
-
- bool hasNSAClauseBug() const { return HasNSAClauseBug; }
+ bool hasVDecCoExecHazard() const { return HasGFX940Insts; }
bool hasHardClauses() const { return MaxHardClauseLength > 0; }
- bool hasGFX90AInsts() const { return GFX90AInsts; }
-
bool hasFPAtomicToDenormModeHazard() const {
return getGeneration() == GFX10;
}
@@ -1333,77 +574,45 @@ public:
return getGeneration() == GFX11;
}
- bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
-
- bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }
+ bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; }
bool requiresCodeObjectV6() const { return RequiresCOV6; }
bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
- bool hasGloballyAddressableScratch() const {
- return HasGloballyAddressableScratch;
- }
-
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
- bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }
+ bool hasVALUReadSGPRHazard() const {
+ return HasGFX12Insts && !HasGFX1250Insts;
+ }
bool setRegModeNeedsVNOPs() const {
- return GFX1250Insts && getGeneration() == GFX12;
+ return HasGFX1250Insts && getGeneration() == GFX12;
}
/// Return if operations acting on VGPR tuples require even alignment.
bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }
/// Return true if the target has the S_PACK_HL_B32_B16 instruction.
- bool hasSPackHL() const { return GFX11Insts; }
+ bool hasSPackHL() const { return HasGFX11Insts; }
/// Return true if the target's EXP instruction has the COMPR flag, which
/// affects the meaning of the EN (enable) bits.
- bool hasCompressedExport() const { return !GFX11Insts; }
+ bool hasCompressedExport() const { return !HasGFX11Insts; }
/// Return true if the target's EXP instruction supports the NULL export
/// target.
- bool hasNullExportTarget() const { return !GFX11Insts; }
-
- bool has1_5xVGPRs() const { return Has1_5xVGPRs; }
-
- bool hasVOPDInsts() const { return HasVOPDInsts; }
+ bool hasNullExportTarget() const { return !HasGFX11Insts; }
bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
/// Return true if the target has the S_DELAY_ALU instruction.
- bool hasDelayAlu() const { return GFX11Insts; }
-
- bool hasPackedTID() const { return HasPackedTID; }
-
- // GFX94* is a derivation to GFX90A. hasGFX940Insts() being true implies that
- // hasGFX90AInsts is also true.
- bool hasGFX940Insts() const { return GFX940Insts; }
-
- // GFX950 is a derivation to GFX94*. hasGFX950Insts() implies that
- // hasGFX940Insts and hasGFX90AInsts are also true.
- bool hasGFX950Insts() const { return GFX950Insts; }
+ bool hasDelayAlu() const { return HasGFX11Insts; }
/// Returns true if the target supports
/// global_load_lds_dwordx3/global_load_lds_dwordx4 or
/// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
- bool hasLDSLoadB96_B128() const {
- return hasGFX950Insts();
- }
-
- bool hasVMemToLDSLoad() const { return HasVMemToLDSLoad; }
-
- bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
-
- bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
-
- bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
-
- bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
-
- bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }
+ bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }
/// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
/// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
@@ -1415,59 +624,25 @@ public:
return getGeneration() == GFX12;
}
- /// \returns true if the target has instructions with xf32 format support.
- bool hasXF32Insts() const { return HasXF32Insts; }
-
- bool hasBitOp3Insts() const { return HasBitOp3Insts; }
-
- bool hasPermlane16Swap() const { return HasPermlane16Swap; }
- bool hasPermlane32Swap() const { return HasPermlane32Swap; }
- bool hasAshrPkInsts() const { return HasAshrPkInsts; }
-
- bool hasMinimum3Maximum3F32() const {
- return HasMinimum3Maximum3F32;
- }
-
- bool hasMinimum3Maximum3F16() const {
- return HasMinimum3Maximum3F16;
+ /// \returns true if the target has packed f32 instructions that only read 32
+  /// bits from a scalar operand (SGPR or literal) and replicate the bits to
+ /// both channels.
+ bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const {
+ return getGeneration() == GFX12 && HasGFX1250Insts;
}
- bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; }
-
- bool hasTanhInsts() const { return HasTanhInsts; }
-
- bool hasTensorCvtLutInsts() const { return HasTensorCvtLutInsts; }
-
- bool hasAddPC64Inst() const { return GFX1250Insts; }
-
- bool has1024AddressableVGPRs() const { return Has1024AddressableVGPRs; }
-
- bool hasMinimum3Maximum3PKF16() const {
- return HasMinimum3Maximum3PKF16;
- }
-
- bool hasTransposeLoadF4F6Insts() const { return HasTransposeLoadF4F6Insts; }
-
- /// \returns true if the target has s_wait_xcnt insertion. Supported for
- /// GFX1250.
- bool hasWaitXCnt() const { return HasWaitXcnt; }
+ bool hasAddPC64Inst() const { return HasGFX1250Insts; }
- // A single DWORD instructions can use a 64-bit literal.
- bool has64BitLiterals() const { return Has64BitLiterals; }
-
- bool hasPointSampleAccel() const { return HasPointSampleAccel; }
-
- bool hasLdsBarrierArriveAtomic() const { return HasLdsBarrierArriveAtomic; }
+  /// \returns true if the target supports expert scheduling mode 2, which
+  /// relies on the compiler to insert waits to avoid hazards between VMEM and
+  /// VALU instructions in some instances.
+ bool hasExpertSchedulingMode() const { return getGeneration() >= GFX12; }
/// \returns The maximum number of instructions that can be enclosed in an
/// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
/// instruction.
unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
- bool hasPrngInst() const { return HasPrngInst; }
-
- bool hasBVHDualAndBVH8Insts() const { return HasBVHDualAndBVH8Insts; }
-
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
@@ -1496,50 +671,22 @@ public:
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
- /// \returns true if the flat_scratch register is initialized by the HW.
- /// In this case it is readonly.
- bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
-
- /// \returns true if the architected SGPRs are enabled.
- bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
-
- /// \returns true if Global Data Share is supported.
- bool hasGDS() const { return HasGDS; }
-
- /// \returns true if Global Wave Sync is supported.
- bool hasGWS() const { return HasGWS; }
-
/// \returns true if the machine has merged shaders in which s0-s7 are
/// reserved by the hardware and user SGPRs start at s8
- bool hasMergedShaders() const {
- return getGeneration() >= GFX9;
- }
+ bool hasMergedShaders() const { return getGeneration() >= GFX9; }
// \returns true if the target supports the pre-NGG legacy geometry path.
bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
- // \returns true if preloading kernel arguments is supported.
- bool hasKernargPreload() const { return KernargPreload; }
-
// \returns true if the target has split barriers feature
bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
- // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
- bool hasCvtFP8VOP1Bug() const { return HasCvtFP8Vop1Bug; }
-
- // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
- // no-return form.
- bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
-
// \returns true if the target has DX10_CLAMP kernel descriptor mode bit
bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
// \returns true if the target has IEEE kernel descriptor mode bit
bool hasIEEEMode() const { return getGeneration() < GFX12; }
- // \returns true if the target has IEEE fminimum/fmaximum instructions
- bool hasIEEEMinimumMaximumInsts() const { return HasIEEEMinimumMaximumInsts; }
-
// \returns true if the target has WG_RR_MODE kernel descriptor mode bit
bool hasRrWGMode() const { return getGeneration() >= GFX12; }
@@ -1547,52 +694,43 @@ public:
/// values.
bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
- bool hasGFX1250Insts() const { return GFX1250Insts; }
+ bool hasINVWBL2WaitCntRequirement() const { return HasGFX1250Insts; }
- bool hasVOPD3() const { return GFX1250Insts; }
-
- // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
- bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
-
- // \returns true if the target has V_MAD_U32 instruction.
- bool hasMadU32Inst() const { return HasMadU32Inst; }
+ bool hasVOPD3() const { return HasGFX1250Insts; }
// \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
- bool hasVectorMulU64() const { return GFX1250Insts; }
+ bool hasVectorMulU64() const { return HasGFX1250Insts; }
// \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
// instructions.
- bool hasMadU64U32NoCarry() const { return GFX1250Insts; }
+ bool hasMadU64U32NoCarry() const { return HasGFX1250Insts; }
// \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
- bool hasIntMinMax64() const { return GFX1250Insts; }
-
- // \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions.
- bool hasAddMinMaxInsts() const { return GFX1250Insts; }
-
- // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
- bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
+ bool hasIntMinMax64() const { return HasGFX1250Insts; }
// \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
- bool hasPkMinMax3Insts() const { return GFX1250Insts; }
+ bool hasPkMinMax3Insts() const { return HasGFX1250Insts; }
  // \returns true if the target has the S_GET_SHADER_CYCLES_U64 instruction.
- bool hasSGetShaderCyclesInst() const { return GFX1250Insts; }
-
- // \returns true if target has S_SETPRIO_INC_WG instruction.
- bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }
+ bool hasSGetShaderCyclesInst() const { return HasGFX1250Insts; }
// \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
// of sign-extending. Note that GFX1250 has not only fixed the bug but also
// extended VA to 57 bits.
- bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; }
+ bool hasGetPCZeroExtension() const {
+ return HasGFX12Insts && !HasGFX1250Insts;
+ }
// \returns true if the target needs to create a prolog for backward
// compatibility when preloading kernel arguments.
bool needsKernArgPreloadProlog() const {
- return hasKernargPreload() && !GFX1250Insts;
+ return hasKernargPreload() && !HasGFX1250Insts;
}
+ bool hasCondSubInsts() const { return HasGFX12Insts; }
+
+ bool hasSubClampInsts() const { return hasGFX10_3Insts(); }
+
/// \returns SGPR allocation granularity supported by the subtarget.
unsigned getSGPRAllocGranule() const {
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
@@ -1724,9 +862,7 @@ public:
/// unit requirement.
unsigned getMaxNumVGPRs(const Function &F) const;
- unsigned getMaxNumAGPRs(const Function &F) const {
- return getMaxNumVGPRs(F);
- }
+ unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); }
/// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
/// of waves per execution unit required for the function \p MF.
@@ -1746,13 +882,9 @@ public:
bool supportsWave64() const { return !hasGFX1250Insts(); }
- bool isWave32() const {
- return getWavefrontSize() == 32;
- }
+ bool isWave32() const { return getWavefrontSize() == 32; }
- bool isWave64() const {
- return getWavefrontSize() == 64;
- }
+ bool isWave64() const { return getWavefrontSize() == 64; }
/// Returns if the wavesize of this subtarget is known reliable. This is false
  /// only for a default target-cpu that does not have an explicit
@@ -1809,11 +941,11 @@ public:
// \returns true if the subtarget has a hazard requiring an "s_nop 0"
// instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
- bool requiresNopBeforeDeallocVGPRs() const { return !GFX1250Insts; }
+ bool requiresNopBeforeDeallocVGPRs() const { return !HasGFX1250Insts; }
// \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
// STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
- bool requiresWaitIdleBeforeGetReg() const { return GFX1250Insts; }
+ bool requiresWaitIdleBeforeGetReg() const { return HasGFX1250Insts; }
bool isDynamicVGPREnabled() const { return DynamicVGPR; }
unsigned getDynamicVGPRBlockSize() const {
@@ -1835,15 +967,21 @@ public:
// Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
// read.
bool hasScratchBaseForwardingHazard() const {
- return GFX1250Insts && getGeneration() == GFX12;
+ return HasGFX1250Insts && getGeneration() == GFX12;
}
- /// \returns true if the subtarget supports clusters of workgroups.
- bool hasClusters() const { return HasClusters; }
+  // src_flat_scratch_hi cannot be used as a source in a SALU instruction
+  // producing a 64-bit result.
+ bool hasFlatScratchHiInB64InstHazard() const {
+ return HasGFX1250Insts && getGeneration() == GFX12;
+ }
- /// \returns true if the subtarget requires a wait for xcnt before atomic
- /// flat/global stores & rmw.
- bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
+ /// \returns true if the subtarget requires a wait for xcnt before VMEM
+  /// accesses that must never be repeated in the event of a page fault/retry.
+  /// Atomic stores/rmw and all volatile accesses fall under this criterion.
+ bool requiresWaitXCntForSingleAccessInstructions() const {
+ return HasGFX1250Insts;
+ }
/// \returns the number of significant bits in the immediate field of the
/// S_NOP instruction.
@@ -1855,10 +993,28 @@ public:
return 3;
}
- /// \returns true if the sub-target supports buffer resource (V#) with 45-bit
- /// num_records.
- bool has45BitNumRecordsBufferResource() const {
- return Has45BitNumRecordsBufferResource;
+ bool supportsBPermute() const {
+ return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ }
+
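+  // Whole-wave bpermute is modeled as available on pre-GFX10 and GFX12
+  // targets; elsewhere bpermute spans only 32 lanes, which suffices exactly
+  // when the wave itself is 32 lanes wide.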
+ bool supportsWaveWideBPermute() const {
+ return (getGeneration() <= AMDGPUSubtarget::GFX9 ||
+ getGeneration() == AMDGPUSubtarget::GFX12) ||
+ isWave32();
+ }
+
+ /// Return true if real (non-fake) variants of True16 instructions using
+ /// 16-bit registers should be code-generated. Fake True16 instructions are
+ /// identical to non-fake ones except that they take 32-bit registers as
+ /// operands and always use their low halves.
+ // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
+ // supported and the support for fake True16 instructions is removed.
+ bool useRealTrue16Insts() const {
+ return hasTrue16BitInsts() && EnableRealTrue16Insts;
+ }
+
+ bool requiresWaitOnWorkgroupReleaseFence() const {
+ return getGeneration() >= GFX10 || isTgSplitEnabled();
}
};
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
index 9e66909..663f538 100644
--- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -35,18 +35,18 @@ using namespace llvm;
#define DEBUG_TYPE "gcn-vopd-utils"
bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
- const MachineInstr &FirstMI,
- const MachineInstr &SecondMI, bool IsVOPD3) {
+ const MachineInstr &MIX,
+ const MachineInstr &MIY, bool IsVOPD3) {
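+  // MIX and MIY are the candidates for the VOPD X and Y components. Unlike
+  // the old FirstMI/SecondMI naming, no program order between them is
+  // assumed here; the ordering assert now lives in the scheduling callback.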
namespace VOPD = AMDGPU::VOPD;
- const MachineFunction *MF = FirstMI.getMF();
+ const MachineFunction *MF = MIX.getMF();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
if (IsVOPD3 && !ST.hasVOPD3())
return false;
- if (!IsVOPD3 && (TII.isVOP3(FirstMI) || TII.isVOP3(SecondMI)))
+ if (!IsVOPD3 && (TII.isVOP3(MIX) || TII.isVOP3(MIY)))
return false;
- if (TII.isDPP(FirstMI) || TII.isDPP(SecondMI))
+ if (TII.isDPP(MIX) || TII.isDPP(MIY))
return false;
const SIRegisterInfo *TRI = dyn_cast<SIRegisterInfo>(ST.getRegisterInfo());
@@ -61,32 +61,24 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
UniqueLiterals.push_back(&Op);
};
SmallVector<Register> UniqueScalarRegs;
- assert([&]() -> bool {
- for (auto MII = MachineBasicBlock::const_iterator(&FirstMI);
- MII != FirstMI.getParent()->instr_end(); ++MII) {
- if (&*MII == &SecondMI)
- return true;
- }
- return false;
- }() && "Expected FirstMI to precede SecondMI");
- // Cannot pair dependent instructions
- for (const auto &Use : SecondMI.uses())
- if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg(), TRI))
+
+ // MIX must not modify any registers used by MIY.
+ for (const auto &Use : MIY.uses())
+ if (Use.isReg() && MIX.modifiesRegister(Use.getReg(), TRI))
return false;
auto getVRegIdx = [&](unsigned OpcodeIdx, unsigned OperandIdx) {
- const MachineInstr &MI = (OpcodeIdx == VOPD::X) ? FirstMI : SecondMI;
+ const MachineInstr &MI = (OpcodeIdx == VOPD::X) ? MIX : MIY;
const MachineOperand &Operand = MI.getOperand(OperandIdx);
if (Operand.isReg() && TRI->isVectorRegister(MRI, Operand.getReg()))
return Operand.getReg();
return Register();
};
- auto InstInfo =
- AMDGPU::getVOPDInstInfo(FirstMI.getDesc(), SecondMI.getDesc());
+ auto InstInfo = AMDGPU::getVOPDInstInfo(MIX.getDesc(), MIY.getDesc());
for (auto CompIdx : VOPD::COMPONENTS) {
- const MachineInstr &MI = (CompIdx == VOPD::X) ? FirstMI : SecondMI;
+ const MachineInstr &MI = (CompIdx == VOPD::X) ? MIX : MIY;
const MachineOperand &Src0 = *TII.getNamedOperand(MI, AMDGPU::OpName::src0);
if (Src0.isReg()) {
@@ -153,8 +145,8 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
  // On GFX12+ if both OpX and OpY are V_MOV_B32 then OpY uses the SRC2
  // source-cache.
bool SkipSrc = ST.getGeneration() >= AMDGPUSubtarget::GFX12 &&
- FirstMI.getOpcode() == AMDGPU::V_MOV_B32_e32 &&
- SecondMI.getOpcode() == AMDGPU::V_MOV_B32_e32;
+ MIX.getOpcode() == AMDGPU::V_MOV_B32_e32 &&
+ MIY.getOpcode() == AMDGPU::V_MOV_B32_e32;
bool AllowSameVGPR = ST.hasGFX1250Insts();
if (InstInfo.hasInvalidOperand(getVRegIdx, *TRI, SkipSrc, AllowSameVGPR,
@@ -163,22 +155,23 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
if (IsVOPD3) {
// BITOP3 can be converted to DUAL_BITOP2 only if src2 is zero.
- if (AMDGPU::hasNamedOperand(SecondMI.getOpcode(), AMDGPU::OpName::bitop3)) {
+    // TODO: Determine whether the MIX check is only relevant to scheduling.
+ if (AMDGPU::hasNamedOperand(MIX.getOpcode(), AMDGPU::OpName::bitop3)) {
const MachineOperand &Src2 =
- *TII.getNamedOperand(SecondMI, AMDGPU::OpName::src2);
+ *TII.getNamedOperand(MIX, AMDGPU::OpName::src2);
if (!Src2.isImm() || Src2.getImm())
return false;
}
- if (AMDGPU::hasNamedOperand(FirstMI.getOpcode(), AMDGPU::OpName::bitop3)) {
+ if (AMDGPU::hasNamedOperand(MIY.getOpcode(), AMDGPU::OpName::bitop3)) {
const MachineOperand &Src2 =
- *TII.getNamedOperand(FirstMI, AMDGPU::OpName::src2);
+ *TII.getNamedOperand(MIY, AMDGPU::OpName::src2);
if (!Src2.isImm() || Src2.getImm())
return false;
}
}
- LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI
- << "\n\tY: " << SecondMI << "\n");
+ LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << MIX
+ << "\n\tY: " << MIY << "\n");
return true;
}
@@ -208,6 +201,15 @@ static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII,
(FirstCanBeVOPD.Y && SecondCanBeVOPD.X)))
return false;
+ assert([&]() -> bool {
+ for (auto MII = MachineBasicBlock::const_iterator(FirstMI);
+ MII != FirstMI->getParent()->instr_end(); ++MII) {
+ if (&*MII == &SecondMI)
+ return true;
+ }
+ return false;
+ }() && "Expected FirstMI to precede SecondMI");
+
return checkVOPDRegConstraints(STII, *FirstMI, SecondMI, VOPD3);
};
diff --git a/llvm/lib/Target/AMDGPU/InstCombineTables.td b/llvm/lib/Target/AMDGPU/InstCombineTables.td
deleted file mode 100644
index 3d62641..0000000
--- a/llvm/lib/Target/AMDGPU/InstCombineTables.td
+++ /dev/null
@@ -1,10 +0,0 @@
-include "AMDGPU.td"
-
-def AMDGPUImageDMaskIntrinsicTable : GenericTable {
- let FilterClass = "AMDGPUImageDMaskIntrinsic";
- let Fields = ["Intr"];
-
- let PrimaryKey = ["Intr"];
- let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic";
- let PrimaryKeyEarlyOut = 1;
-}
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
index afaa190..9ec1213 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
@@ -322,13 +322,13 @@ bool AMDGPUCustomBehaviour::hasModifiersSet(
}
// taken from SIInstrInfo::isGWS()
-bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
+bool AMDGPUCustomBehaviour::isGWS(uint32_t Opcode) const {
const MCInstrDesc &MCID = MCII.get(Opcode);
return MCID.TSFlags & SIInstrFlags::GWS;
}
// taken from SIInstrInfo::isAlwaysGDS()
-bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
+bool AMDGPUCustomBehaviour::isAlwaysGDS(uint32_t Opcode) const {
return Opcode == AMDGPU::DS_ORDERED_COUNT ||
Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
index cbc7427..aeb5c03 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
@@ -32,7 +32,7 @@ public:
AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
: InstrPostProcess(STI, MCII) {}
- ~AMDGPUInstrPostProcess() = default;
+ ~AMDGPUInstrPostProcess() override = default;
void postProcessInstruction(Instruction &Inst, const MCInst &MCI) override;
};
@@ -68,9 +68,9 @@ class AMDGPUCustomBehaviour : public CustomBehaviour {
bool hasModifiersSet(const std::unique_ptr<Instruction> &Inst,
AMDGPU::OpName OpName) const;
/// Helper function used in generateWaitCntInfo()
- bool isGWS(uint16_t Opcode) const;
+ bool isGWS(uint32_t Opcode) const;
/// Helper function used in generateWaitCntInfo()
- bool isAlwaysGDS(uint16_t Opcode) const;
+ bool isAlwaysGDS(uint32_t Opcode) const;
/// Helper function used in generateWaitCntInfo()
bool isVMEM(const MCInstrDesc &MCID);
/// This method gets called from checkCustomHazard when mca is attempting to
@@ -88,7 +88,7 @@ public:
AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII);
- ~AMDGPUCustomBehaviour() = default;
+ ~AMDGPUCustomBehaviour() override = default;
/// This method is used to determine if an instruction
/// should be allowed to be dispatched. The return value is
/// how many cycles until the instruction can be dispatched.
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 703ec0a..4aa4083 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -12,6 +12,7 @@
#include "SIDefines.h"
#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -336,7 +337,7 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI,
// \returns a low (below-256) vgpr representing a high vgpr \p Reg
// [v256..v1023], or \p Reg itself otherwise.
-static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) {
+static MCRegister getRegForPrinting(MCRegister Reg, const MCRegisterInfo &MRI) {
unsigned Enc = MRI.getEncodingValue(Reg);
unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
if (Idx < 0x100)
@@ -355,10 +356,10 @@ static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) {
}
// Restore MSBs of a VGPR above 255 from the MCInstrAnalysis.
-static MCPhysReg getRegFromMIA(MCPhysReg Reg, unsigned OpNo,
- const MCInstrDesc &Desc,
- const MCRegisterInfo &MRI,
- const AMDGPUMCInstrAnalysis &MIA) {
+static MCRegister getRegFromMIA(MCRegister Reg, unsigned OpNo,
+ const MCInstrDesc &Desc,
+ const MCRegisterInfo &MRI,
+ const AMDGPUMCInstrAnalysis &MIA) {
unsigned VgprMSBs = MIA.getVgprMSBs();
if (!VgprMSBs)
return Reg;
@@ -403,10 +404,10 @@ void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O,
}
#endif
- unsigned PrintReg = getRegForPrinting(Reg, MRI);
+ MCRegister PrintReg = getRegForPrinting(Reg, MRI);
O << getRegisterName(PrintReg);
- if (PrintReg != Reg.id())
+ if (PrintReg != Reg)
O << " /*" << getRegisterName(Reg) << "*/";
}
@@ -490,6 +491,18 @@ void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo,
printRegularOperand(MI, OpNo, STI, O);
}
+void AMDGPUInstPrinter::printAVLdSt32Align2RegOp(const MCInst *MI,
+ unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ MCRegister Reg = MI->getOperand(OpNo).getReg();
+
+  // On targets with an even alignment requirement the operand is allocated
+  // as an even-aligned register pair; print only its low half (sub0).
+ if (MCRegister SubReg = MRI.getSubReg(Reg, AMDGPU::sub0))
+ Reg = SubReg;
+ printRegOperand(Reg, O, MRI);
+}
+
void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -610,6 +623,25 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType,
printImmediateFP16(static_cast<uint16_t>(Imm), STI, O))
return;
break;
+ case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: {
+ if (AMDGPU::isGFX11Plus(STI)) {
+ // For GFX11+, the inline constant is duplicated to both channels, so we
+ // need to check if the low and high 16 bits are the same, and then if
+ // they can be printed as inline constant values.
+ uint16_t Lo16 = static_cast<uint16_t>(Imm & 0xFFFF);
+ uint16_t Hi16 = static_cast<uint16_t>((Imm >> 16) & 0xFFFF);
+ if (Lo16 == Hi16 &&
+ printImmediateFP16(static_cast<uint16_t>(Imm), STI, O))
+ return;
+ } else {
+ // For pre-GFX11, the inline constant is in the low 16 bits, so we need
+      // to check if it can be printed as an inline constant value.
+ if (isUInt<16>(Imm) &&
+ printImmediateFP16(static_cast<uint16_t>(Imm), STI, O))
+ return;
+ }
+ break;
+ }
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
if (isUInt<16>(Imm) &&
@@ -795,14 +827,24 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
// Intention: print disassembler message when invalid code is decoded,
// for example sgpr register used in VReg or VISrc(VReg or imm) operand.
const MCOperandInfo &OpInfo = Desc.operands()[OpNo];
- int16_t RCID = MII.getOpRegClassID(
- OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo));
- if (RCID != -1) {
+ if (OpInfo.RegClass != -1) {
+ int16_t RCID = MII.getOpRegClassID(
+ OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo));
const MCRegisterClass &RC = MRI.getRegClass(RCID);
auto Reg = mc2PseudoReg(Op.getReg());
if (!RC.contains(Reg) && !isInlineValue(Reg)) {
- O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC)
- << "\' register class*/";
+ bool IsWaveSizeOp = OpInfo.isLookupRegClassByHwMode() &&
+ (OpInfo.RegClass == AMDGPU::SReg_1 ||
+ OpInfo.RegClass == AMDGPU::SReg_1_XEXEC);
+ // Suppress this comment for a mismatched wavesize. Some users expect to
+ // be able to assemble and disassemble modules with mixed wavesizes, but
+      // in MC we do not know the subtarget of each function.
+ //
+ // TODO: Should probably print it anyway, maybe a more specific version.
+ if (!IsWaveSizeOp) {
+ O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC)
+ << "\' register class*/";
+ }
}
}
} else if (Op.isImm()) {
@@ -844,6 +886,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
@@ -1331,12 +1374,9 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI,
return;
O << Name;
- for (int I = 0; I < NumOps; ++I) {
- if (I != 0)
- O << ',';
-
- O << !!(Ops[I] & Mod);
- }
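+  // ListSeparator emits nothing before the first element and the separator
+  // before each subsequent one.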
+ ListSeparator Sep(",");
+ for (int I = 0; I < NumOps; ++I)
+ O << Sep << !!(Ops[I] & Mod);
if (HasDstSel) {
O << ',' << !!(Ops[0] & SISrcMods::DST_OP_SEL);
@@ -1428,26 +1468,10 @@ void AMDGPUInstPrinter::printMatrixFMT(const MCInst *MI, unsigned OpNo,
return;
O << " matrix_" << AorB << "_fmt:";
- switch (Imm) {
- default:
+ if (Imm < static_cast<int64_t>(std::size(WMMAMods::ModMatrixFmt)))
+ O << WMMAMods::ModMatrixFmt[Imm];
+ else
O << Imm;
- break;
- case WMMA::MatrixFMT::MATRIX_FMT_FP8:
- O << "MATRIX_FMT_FP8";
- break;
- case WMMA::MatrixFMT::MATRIX_FMT_BF8:
- O << "MATRIX_FMT_BF8";
- break;
- case WMMA::MatrixFMT::MATRIX_FMT_FP6:
- O << "MATRIX_FMT_FP6";
- break;
- case WMMA::MatrixFMT::MATRIX_FMT_BF6:
- O << "MATRIX_FMT_BF6";
- break;
- case WMMA::MatrixFMT::MATRIX_FMT_FP4:
- O << "MATRIX_FMT_FP4";
- break;
- }
}
void AMDGPUInstPrinter::printMatrixAFMT(const MCInst *MI, unsigned OpNo,
@@ -1470,17 +1494,10 @@ void AMDGPUInstPrinter::printMatrixScale(const MCInst *MI, unsigned OpNo,
return;
O << " matrix_" << AorB << "_scale:";
- switch (Imm) {
- default:
+ if (Imm < static_cast<int64_t>(std::size(WMMAMods::ModMatrixScale)))
+ O << WMMAMods::ModMatrixScale[Imm];
+ else
O << Imm;
- break;
- case WMMA::MatrixScale::MATRIX_SCALE_ROW0:
- O << "MATRIX_SCALE_ROW0";
- break;
- case WMMA::MatrixScale::MATRIX_SCALE_ROW1:
- O << "MATRIX_SCALE_ROW1";
- break;
- }
}
void AMDGPUInstPrinter::printMatrixAScale(const MCInst *MI, unsigned OpNo,
@@ -1503,20 +1520,10 @@ void AMDGPUInstPrinter::printMatrixScaleFmt(const MCInst *MI, unsigned OpNo,
return;
O << " matrix_" << AorB << "_scale_fmt:";
- switch (Imm) {
- default:
+ if (Imm < static_cast<int64_t>(std::size(WMMAMods::ModMatrixScaleFmt)))
+ O << WMMAMods::ModMatrixScaleFmt[Imm];
+ else
O << Imm;
- break;
- case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E8:
- O << "MATRIX_SCALE_FMT_E8";
- break;
- case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E5M3:
- O << "MATRIX_SCALE_FMT_E5M3";
- break;
- case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E4M3:
- O << "MATRIX_SCALE_FMT_E4M3";
- break;
- }
}
void AMDGPUInstPrinter::printMatrixAScaleFmt(const MCInst *MI, unsigned OpNo,
@@ -1574,14 +1581,10 @@ void AMDGPUInstPrinter::printGPRIdxMode(const MCInst *MI, unsigned OpNo,
O << formatHex(static_cast<uint64_t>(Val));
} else {
O << "gpr_idx(";
- bool NeedComma = false;
+ ListSeparator Sep(",");
for (unsigned ModeId = ID_MIN; ModeId <= ID_MAX; ++ModeId) {
- if (Val & (1 << ModeId)) {
- if (NeedComma)
- O << ',';
- O << IdSymbolic[ModeId];
- NeedComma = true;
- }
+ if (Val & (1 << ModeId))
+ O << Sep << IdSymbolic[ModeId];
}
O << ')';
}
@@ -1658,6 +1661,19 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
}
}
+void AMDGPUInstPrinter::printWaitEvent(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ using namespace llvm::AMDGPU::WaitEvent;
+ const uint16_t Imm16 = static_cast<uint16_t>(MI->getOperand(OpNo).getImm());
+
+ StringRef EventName = getWaitEventMaskName(Imm16, STI);
+ if (EventName.empty())
+ O << formatHex(static_cast<uint64_t>(Imm16));
+ else
+ O << EventName;
+}
+
static void printSwizzleBitmask(const uint16_t AndMask,
const uint16_t OrMask,
const uint16_t XorMask,
@@ -1788,25 +1804,16 @@ void AMDGPUInstPrinter::printSWaitCnt(const MCInst *MI, unsigned OpNo,
bool IsDefaultLgkmcnt = Lgkmcnt == getLgkmcntBitMask(ISA);
bool PrintAll = IsDefaultVmcnt && IsDefaultExpcnt && IsDefaultLgkmcnt;
- bool NeedSpace = false;
+ ListSeparator Sep(" ");
- if (!IsDefaultVmcnt || PrintAll) {
- O << "vmcnt(" << Vmcnt << ')';
- NeedSpace = true;
- }
+ if (!IsDefaultVmcnt || PrintAll)
+ O << Sep << "vmcnt(" << Vmcnt << ')';
- if (!IsDefaultExpcnt || PrintAll) {
- if (NeedSpace)
- O << ' ';
- O << "expcnt(" << Expcnt << ')';
- NeedSpace = true;
- }
+ if (!IsDefaultExpcnt || PrintAll)
+ O << Sep << "expcnt(" << Expcnt << ')';
- if (!IsDefaultLgkmcnt || PrintAll) {
- if (NeedSpace)
- O << ' ';
- O << "lgkmcnt(" << Lgkmcnt << ')';
- }
+ if (!IsDefaultLgkmcnt || PrintAll)
+ O << Sep << "lgkmcnt(" << Lgkmcnt << ')';
}
void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo,
@@ -1822,14 +1829,10 @@ void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo,
StringRef Name;
unsigned Val;
bool IsDefault;
- bool NeedSpace = false;
+ ListSeparator Sep(" ");
while (decodeDepCtr(Imm16, Id, Name, Val, IsDefault, STI)) {
- if (!IsDefault || !HasNonDefaultVal) {
- if (NeedSpace)
- O << ' ';
- O << Name << '(' << Val << ')';
- NeedSpace = true;
- }
+ if (!IsDefault || !HasNonDefaultVal)
+ O << Sep << Name << '(' << Val << ')';
}
} else {
O << formatHex(Imm16);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index b27295e..5e9ebc6 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -77,6 +77,9 @@ private:
raw_ostream &O);
void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printAVLdSt32Align2RegOp(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
void printImmediateInt16(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediateBF16(uint32_t Imm, const MCSubtargetInfo &STI,
@@ -232,6 +235,8 @@ protected:
raw_ostream &O);
void printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printWaitEvent(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printSwizzle(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printSWaitCnt(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index ea758bb..029d2ea 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -95,6 +95,13 @@ private:
void getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups,
APInt &Inst, APInt &Scratch,
const MCSubtargetInfo &STI) const;
+
+ template <bool HasSrc0, bool HasSrc1, bool HasSrc2>
+ APInt postEncodeVOP3(const MCInst &MI, APInt EncodedValue,
+ const MCSubtargetInfo &STI) const;
+
+ APInt postEncodeVOPCX(const MCInst &MI, APInt EncodedValue,
+ const MCSubtargetInfo &STI) const;
};
} // end anonymous namespace
@@ -343,6 +350,14 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding(
return AMDGPU::getInlineEncodingV2F16(static_cast<uint32_t>(Imm))
.value_or(255);
+ case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
+ // V_PK_FMAC_F16 has different inline constant behavior on pre-GFX11 vs
+ // GFX11+: pre-GFX11 produces (f16, 0), GFX11+ duplicates f16 to both
+ // halves.
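+ // For example, the inline constant 1.0 covers the literal 0x00003C00 on
+ // pre-GFX11 but 0x3C003C00 on GFX11+.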
+ return AMDGPU::getPKFMACF16InlineEncoding(static_cast<uint32_t>(Imm),
+ AMDGPU::isGFX11Plus(STI))
+ .value_or(255);
+
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
return AMDGPU::getInlineEncodingV2BF16(static_cast<uint32_t>(Imm))
@@ -374,11 +389,6 @@ uint64_t AMDGPUMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const {
return OP_SEL_HI_0 | OP_SEL_HI_1 | OP_SEL_HI_2;
}
-static bool isVCMPX64(const MCInstrDesc &Desc) {
- return (Desc.TSFlags & SIInstrFlags::VOP3) &&
- Desc.hasImplicitDefOfPhysReg(AMDGPU::EXEC);
-}
-
void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
SmallVectorImpl<char> &CB,
SmallVectorImpl<MCFixup> &Fixups,
@@ -403,18 +413,6 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
Encoding |= getImplicitOpSelHiEncoding(Opcode);
}
- // GFX10+ v_cmpx opcodes promoted to VOP3 have implied dst=EXEC.
- // Documentation requires dst to be encoded as EXEC (0x7E),
- // but it looks like the actual value encoded for dst operand
- // is ignored by HW. It was decided to define dst as "do not care"
- // in td files to allow disassembler accept any dst value.
- // However, dst is encoded as EXEC for compatibility with SP3.
- if (AMDGPU::isGFX10Plus(STI) && isVCMPX64(Desc)) {
- assert((Encoding & 0xFF) == 0);
- Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO) &
- AMDGPU::HWEncoding::LO256_REG_IDX_MASK;
- }
-
for (unsigned i = 0; i < bytes; i++) {
CB.push_back((uint8_t)Encoding.extractBitsAsZExtValue(8, 8 * i));
}
@@ -733,4 +731,37 @@ void AMDGPUMCCodeEmitter::getMachineOpValueCommon(
llvm_unreachable("Encoding of this operand type is not supported yet.");
}
+template <bool HasSrc0, bool HasSrc1, bool HasSrc2>
+APInt AMDGPUMCCodeEmitter::postEncodeVOP3(const MCInst &MI, APInt EncodedValue,
+ const MCSubtargetInfo &STI) const {
+ if (!AMDGPU::isGFX10Plus(STI))
+ return EncodedValue;
+ // Set unused source fields in VOP3 encodings to inline immediate 0 to avoid
+ // hardware conservatively assuming the instruction reads SGPRs.
+ constexpr uint64_t InlineImmediate0 = 0x80;
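+ // The 9-bit VOP3 source fields sit at bits [40:32], [49:41] and [58:50];
+ // 0x80 is the source-operand encoding of the inline constant 0.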
+ if (!HasSrc0)
+ EncodedValue |= InlineImmediate0 << 32;
+ if (!HasSrc1)
+ EncodedValue |= InlineImmediate0 << 41;
+ if (!HasSrc2)
+ EncodedValue |= InlineImmediate0 << 50;
+ return EncodedValue;
+}
+
+APInt AMDGPUMCCodeEmitter::postEncodeVOPCX(const MCInst &MI, APInt EncodedValue,
+ const MCSubtargetInfo &STI) const {
+ // GFX10+ v_cmpx opcodes promoted to VOP3 have implied dst=EXEC.
+ // Documentation requires dst to be encoded as EXEC (0x7E),
+ // but the actual value encoded for the dst operand appears to be
+ // ignored by HW. It was decided to define dst as "do not care" in the
+ // td files so that the disassembler accepts any dst value.
+ // However, dst is encoded as EXEC for compatibility with SP3.
+ [[maybe_unused]] const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ assert((Desc.TSFlags & SIInstrFlags::VOP3) &&
+ Desc.hasImplicitDefOfPhysReg(AMDGPU::EXEC));
+ EncodedValue |= MRI.getEncodingValue(AMDGPU::EXEC_LO) &
+ AMDGPU::HWEncoding::LO256_REG_IDX_MASK;
+ return postEncodeVOP3<true, true, false>(MI, EncodedValue, STI);
+}
+
#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
index c27be02..63437779 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -7,9 +7,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMCExpr.h"
-#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -317,30 +315,6 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR,
return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx);
}
-/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
-///
-/// Remove dependency on GCNSubtarget and depend only only the necessary values
-/// for said occupancy computation. Should match computeOccupancy implementation
-/// without passing \p STM on.
-const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(
- unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs,
- unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx) {
- unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
- unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
- unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
- unsigned Generation = STM.getGeneration();
-
- auto CreateExpr = [&Ctx](unsigned Value) {
- return MCConstantExpr::create(Value, Ctx);
- };
-
- return create(AGVK_Occupancy,
- {CreateExpr(MaxWaves), CreateExpr(Granule),
- CreateExpr(TargetTotalNumVGPRs), CreateExpr(Generation),
- CreateExpr(InitOcc), NumSGPRs, NumVGPRs},
- Ctx);
-}
-
const AMDGPUMCExpr *AMDGPUMCExpr::createLit(LitModifier Lit, int64_t Value,
MCContext &Ctx) {
assert(Lit == LitModifier::Lit || Lit == LitModifier::Lit64);
@@ -481,7 +455,7 @@ static void unaryOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM,
return;
case MCUnaryExpr::Opcode::Minus: {
KB.makeNegative();
- KBM[Expr] = KB;
+ KBM[Expr] = std::move(KB);
return;
}
case MCUnaryExpr::Opcode::Not: {
@@ -492,7 +466,7 @@ static void unaryOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM,
}
case MCUnaryExpr::Opcode::Plus: {
KB.makeNonNegative();
- KBM[Expr] = KB;
+ KBM[Expr] = std::move(KB);
return;
}
}
@@ -514,7 +488,7 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM,
knownBitsMapHelper(Arg, KBM, Depth + 1);
KB |= KBM[Arg];
}
- KBM[Expr] = KB;
+ KBM[Expr] = std::move(KB);
return;
}
case AMDGPUMCExpr::VariantKind::AGVK_Max: {
@@ -524,7 +498,7 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM,
knownBitsMapHelper(Arg, KBM, Depth + 1);
KB = KnownBits::umax(KB, KBM[Arg]);
}
- KBM[Expr] = KB;
+ KBM[Expr] = std::move(KB);
return;
}
case AMDGPUMCExpr::VariantKind::AGVK_ExtraSGPRs:
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index 54fcd2a..bf7b40b 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -64,7 +64,7 @@ private:
ArrayRef<const MCExpr *> Args;
AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx);
- ~AMDGPUMCExpr();
+ ~AMDGPUMCExpr() override;
bool evaluateExtraSGPRs(MCValue &Res, const MCAssembler *Asm) const;
bool evaluateTotalNumVGPR(MCValue &Res, const MCAssembler *Asm) const;
@@ -98,11 +98,6 @@ public:
return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx);
}
- static const AMDGPUMCExpr *
- createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
- const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize,
- const GCNSubtarget &STM, MCContext &Ctx);
-
static const AMDGPUMCExpr *createLit(LitModifier Lit, int64_t Value,
MCContext &Ctx);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 013cfeb..28b4da8 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -168,7 +168,7 @@ bool AMDGPUMCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
void AMDGPUMCInstrAnalysis::updateState(const MCInst &Inst, uint64_t Addr) {
if (Inst.getOpcode() == AMDGPU::S_SET_VGPR_MSB_gfx12)
- VgprMSBs = Inst.getOperand(0).getImm();
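+ // The VGPR MSB state occupies the low eight bits of the immediate; mask
+ // off any remaining bits.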
+ VgprMSBs = Inst.getOperand(0).getImm() & 0xff;
else if (isTerminator(Inst))
VgprMSBs = 0;
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 5a08573..86c5d1c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -114,10 +114,12 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: AK = GK_GFX1151; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152: AK = GK_GFX1152; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1153: AK = GK_GFX1153; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1170: AK = GK_GFX1170; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: AK = GK_GFX1200; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: AK = GK_GFX1201; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1250: AK = GK_GFX1250; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1251: AK = GK_GFX1251; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1310: AK = GK_GFX1310; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC: AK = GK_GFX9_GENERIC; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC: AK = GK_GFX9_4_GENERIC; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC: AK = GK_GFX10_1_GENERIC; break;
@@ -201,10 +203,12 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX1151: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151;
case GK_GFX1152: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152;
case GK_GFX1153: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1153;
+ case GK_GFX1170: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1170;
case GK_GFX1200: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200;
case GK_GFX1201: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201;
case GK_GFX1250: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1250;
case GK_GFX1251: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1251;
+ case GK_GFX1310: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1310;
case GK_GFX9_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC;
case GK_GFX9_4_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC;
case GK_GFX10_1_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC;
@@ -302,9 +306,9 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceInfo(
#undef PRINT_RES_INFO
}
-void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
- const MCSymbol *MaxAGPR,
- const MCSymbol *MaxSGPR) {
+void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(
+ const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR,
+ const MCSymbol *MaxNamedBarrier) {
#define PRINT_RES_INFO(ARG) \
OS << "\t.set "; \
ARG->print(OS, getContext().getAsmInfo()); \
@@ -315,6 +319,7 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
PRINT_RES_INFO(MaxVGPR);
PRINT_RES_INFO(MaxAGPR);
PRINT_RES_INFO(MaxSGPR);
+ PRINT_RES_INFO(MaxNamedBarrier);
#undef PRINT_RES_INFO
}
@@ -398,7 +403,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
EmitMCExpr(KD.kernarg_size);
OS << '\n';
- if (isGFX1250(STI)) {
+ if (isGFX1250Plus(STI)) {
PrintField(KD.compute_pgm_rsrc2,
amdhsa::COMPUTE_PGM_RSRC2_GFX125_USER_SGPR_COUNT_SHIFT,
amdhsa::COMPUTE_PGM_RSRC2_GFX125_USER_SGPR_COUNT,
@@ -512,7 +517,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
OS << '\n';
}
- if (AMDGPU::isGFX1250(STI))
+ if (isGFX1250Plus(STI))
PrintField(KD.compute_pgm_rsrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 22afcde..3a0d8dc 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -69,7 +69,8 @@ public:
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
const MCSymbol *MaxAGPR,
- const MCSymbol *MaxSGPR) {};
+ const MCSymbol *MaxSGPR,
+ const MCSymbol *MaxNamedBarrier) {};
/// \returns True on success, false on failure.
virtual bool EmitISAVersion() { return true; }
@@ -149,7 +150,8 @@ public:
const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) override;
void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR,
- const MCSymbol *MaxSGPR) override;
+ const MCSymbol *MaxSGPR,
+ const MCSymbol *MaxNamedBarrier) override;
/// \returns True on success, false on failure.
bool EmitISAVersion() override;
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 5f6d742..b023c96 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -877,69 +877,69 @@ multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> {
}
class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterOperand data_rc,
- RegisterClass addr_rc, string dns="">
- : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> {
- let Constraints = "$vdst = $vdata";
-
+ RegisterClass addr_rc, bit noRtn, string dns="">
+ : MIMG_gfx6789 <op, !if(noRtn, (outs), (outs data_rc:$vdst)), dns> {
+ let Constraints = !if(noRtn, "", "$vdst = $vdata");
+ let isCodeGenOnly = noRtn;
let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da);
- let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da";
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da";
}
class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterOperand data_rc,
- RegisterClass addr_rc, string dns="">
- : MIMG_gfx90a <op, (outs getAlign2RegOp<data_rc>.ret:$vdst), dns> {
- let Constraints = "$vdst = $vdata";
-
+ RegisterClass addr_rc, bit noRtn, string dns="">
+ : MIMG_gfx90a <op, !if(noRtn, (outs), (outs getAlign2RegOp<data_rc>.ret:$vdst)), dns> {
+ let Constraints = !if(noRtn, "", "$vdst = $vdata");
+ let isCodeGenOnly = noRtn;
let InOperandList = (ins getAlign2RegOp<data_rc>.ret:$vdata,
addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, LWE:$lwe, DA:$da);
- let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da";
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da";
}
class MIMG_Atomic_si<mimgopc op, string asm, RegisterOperand data_rc,
- RegisterClass addr_rc, bit enableDasm = 0>
- : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc,
+ RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0>
+ : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, noRtn,
!if(enableDasm, "GFX6GFX7", "")> {
let AssemblerPredicate = isGFX6GFX7;
}
class MIMG_Atomic_vi<mimgopc op, string asm, RegisterOperand data_rc,
- RegisterClass addr_rc, bit enableDasm = 0>
- : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> {
+ RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0>
+ : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX8", "")> {
let AssemblerPredicate = isGFX8GFX9NotGFX90A;
let MIMGEncoding = MIMGEncGfx8;
}
class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterOperand data_rc,
- RegisterClass addr_rc, bit enableDasm = 0>
- : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> {
+ RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0>
+ : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX90A", "")> {
let AssemblerPredicate = isGFX90APlus;
let MIMGEncoding = MIMGEncGfx90a;
}
class MIMG_Atomic_gfx10<mimgopc op, string opcode,
RegisterOperand DataRC, RegisterClass AddrRC,
- bit enableDisasm = 0>
- : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdst),
+ bit noRtn = 0, bit enableDisasm = 0>
+ : MIMG_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)),
!if(enableDisasm, "GFX10", "")> {
- let Constraints = "$vdst = $vdata";
-
+ let Constraints = !if(noRtn, "", "$vdst = $vdata");
+ let isCodeGenOnly = noRtn;
let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc,
DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe);
- let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
}
class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode,
RegisterOperand DataRC, int num_addrs,
- bit enableDisasm = 0>
- : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdst), num_addrs,
+ bit noRtn = 0, bit enableDisasm = 0>
+ : MIMG_nsa_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs,
!if(enableDisasm, "GFX10", "")> {
- let Constraints = "$vdst = $vdata";
-
+ let Constraints = !if(noRtn, "", "$vdst = $vdata");
+ let isCodeGenOnly = noRtn;
let InOperandList = !con((ins DataRC:$vdata),
AddrIns,
(ins SReg_256_XNULL:$srsrc, DMask:$dmask,
@@ -950,24 +950,24 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode,
class MIMG_Atomic_gfx11<mimgopc op, string opcode,
RegisterOperand DataRC, RegisterClass AddrRC,
- bit enableDisasm = 0>
- : MIMG_gfx11<op.GFX11, (outs DataRC:$vdst),
+ bit noRtn = 0, bit enableDisasm = 0>
+ : MIMG_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)),
!if(enableDisasm, "GFX11", "")> {
- let Constraints = "$vdst = $vdata";
-
+ let Constraints = !if(noRtn, "", "$vdst = $vdata");
+ let isCodeGenOnly = noRtn;
let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc,
DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe);
- let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
}
class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode,
RegisterOperand DataRC, int num_addrs,
- bit enableDisasm = 0>
- : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdst), num_addrs,
+ bit noRtn = 0, bit enableDisasm = 0>
+ : MIMG_nsa_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs,
!if(enableDisasm, "GFX11", "")> {
- let Constraints = "$vdst = $vdata";
-
+ let Constraints = !if(noRtn, "", "$vdst = $vdata");
+ let isCodeGenOnly = noRtn;
let InOperandList = !con((ins DataRC:$vdata),
AddrIns,
(ins SReg_256_XNULL:$srsrc, DMask:$dmask,
@@ -977,11 +977,11 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode,
}
class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterOperand DataRC,
- int num_addrs, string renamed, bit enableDisasm = 0>
- : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdst), num_addrs,
+ int num_addrs, string renamed, bit noRtn = 0, bit enableDisasm = 0>
+ : VIMAGE_gfx12<op.GFX12, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs,
!if(enableDisasm, "GFX12", "")> {
- let Constraints = "$vdst = $vdata";
-
+ let Constraints = !if(noRtn, "", "$vdst = $vdata");
+ let isCodeGenOnly = noRtn;
let InOperandList = !con((ins DataRC:$vdata),
AddrIns,
(ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim,
@@ -994,95 +994,96 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
RegisterOperand data_rc,
bit enableDasm = 0,
bit isFP = 0,
+ bit noRtn = 0,
string renamed = ""> {
let hasSideEffects = 1, // FIXME: remove this
mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1,
- FPAtomic = isFP in {
+ FPAtomic = isFP, IsAtomicNoRet = noRtn in {
let VAddrDwords = 1 in {
let ssamp = 0 in {
if op.HAS_SI then {
- def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>;
+ def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, noRtn, enableDasm>;
}
if op.HAS_VI then {
- def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>;
+ def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, noRtn, enableDasm>;
let hasPostISelHook = 1 in
- def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>;
+ def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, noRtn, enableDasm>;
}
if op.HAS_GFX10M then {
- def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>;
+ def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>;
}
if op.HAS_GFX11 then {
- def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, enableDasm>;
+ def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>;
}
}
if op.HAS_GFX12 then {
- def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed>;
+ def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed, noRtn>;
}
}
let VAddrDwords = 2 in {
let ssamp = 0 in {
if op.HAS_SI then {
- def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>;
+ def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, noRtn, 0>;
}
if op.HAS_VI then {
- def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>;
- def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, 0>;
+ def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, noRtn, 0>;
+ def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, noRtn, 0>;
}
if op.HAS_GFX10M then {
- def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>;
- def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>;
+ def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, noRtn, 0>;
+ def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, noRtn, 0>;
}
if op.HAS_GFX11 then {
- def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, 0>;
- def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, 0>;
+ def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, noRtn, 0>;
+ def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, noRtn, 0>;
}
}
if op.HAS_GFX12 then {
- def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed>;
+ def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed, noRtn>;
}
}
let VAddrDwords = 3 in {
let ssamp = 0 in {
if op.HAS_SI then {
- def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>;
+ def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, noRtn, 0>;
}
if op.HAS_VI then {
- def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>;
- def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, 0>;
+ def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, noRtn, 0>;
+ def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, noRtn, 0>;
}
if op.HAS_GFX10M then {
- def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>;
- def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>;
+ def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, noRtn, 0>;
+ def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, noRtn, 0>;
}
if op.HAS_GFX11 then {
- def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, 0>;
- def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, 0>;
+ def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, noRtn, 0>;
+ def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, noRtn, 0>;
}
}
if op.HAS_GFX12 then {
- def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed>;
+ def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed, noRtn>;
}
}
let VAddrDwords = 4 in {
let ssamp = 0 in {
if op.HAS_SI then {
- def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>;
+ def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, noRtn, 0>;
}
if op.HAS_VI then {
- def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>;
- def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, 0>;
+ def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, noRtn, 0>;
+ def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, noRtn, 0>;
}
if op.HAS_GFX10M then {
- def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>;
- def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>;
+ def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, noRtn, 0>;
+ def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, noRtn, enableDasm>;
}
if op.HAS_GFX11 then {
- def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, 0>;
- def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, enableDasm>;
+ def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, noRtn, 0>;
+ def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, noRtn, enableDasm>;
}
}
if op.HAS_GFX12 then {
- def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, enableDasm>;
+ def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, noRtn, enableDasm>;
}
}
}
@@ -1095,12 +1096,13 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
}
}
-multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
- string renamed = ""> { // 64-bit atomics
- let IsAtomicRet = 1 in {
+multiclass MIMG_Atomic_Base <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
+ bit noRtn = 0, string renamed = ""> { // 64-bit atomics
+ let IsAtomicRet = !not(noRtn) in {
def "" : MIMGBaseOpcode {
let Atomic = 1;
let AtomicX2 = isCmpSwap;
+ let NoReturn = noRtn;
}
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
@@ -1109,22 +1111,28 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
// Other variants are reconstructed by disassembler using dmask and tfe.
if !not(isCmpSwap) then {
let VDataDwords = 1 in
- defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, renamed>;
+ defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, noRtn, renamed>;
}
let VDataDwords = 2 in
- defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, renamed>;
+ defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, noRtn, renamed>;
let VDataDwords = 3 in
- defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, renamed>;
+ defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, noRtn, renamed>;
if isCmpSwap then {
let VDataDwords = 4 in
- defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, renamed>;
+ defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, noRtn, renamed>;
let VDataDwords = 5 in
- defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, renamed>;
+ defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, noRtn, renamed>;
}
}
- } // End IsAtomicRet = 1
+ }
+}
+
+multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
+ string renamed = ""> {
+ defm "" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/0, renamed>;
+ defm "_NORTN" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/1, renamed>;
}
multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed,
@@ -1820,6 +1828,7 @@ let SubtargetPredicate = isGFX12Plus in {
class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
Intrinsic Intr = I;
MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod));
+ MIMGBaseOpcode AtomicNoRetBaseOpcode = BaseOpcode;
AMDGPUDimProps Dim = I.P.Dim;
AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>;
@@ -1855,13 +1864,20 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0));
}
+class ImageDimAtomicIntrinsicInfo<AMDGPUImageDimIntrinsic I>
+ : ImageDimIntrinsicInfo<I> {
+ MIMGBaseOpcode AtomicNoRetBaseOpcode =
+ !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod, "_NORTN"));
+}
+
def ImageDimIntrinsicTable : GenericTable {
let FilterClass = "ImageDimIntrinsicInfo";
- let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs",
- "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd",
- "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex",
+ let Fields = ["Intr", "BaseOpcode", "AtomicNoRetBaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData",
+ "NumVAddrs", "NumArgs", "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex",
+ "VAddrEnd", "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex",
"BiasTyArg", "GradientTyArg", "CoordTyArg"];
string TypeOf_BaseOpcode = "MIMGBaseOpcode";
+ string TypeOf_AtomicNoRetBaseOpcode = "MIMGBaseOpcode";
string TypeOf_Dim = "MIMGDim";
let PrimaryKey = ["Intr"];
@@ -1874,11 +1890,14 @@ def getImageDimIntrinsicByBaseOpcode : SearchIndex {
let Key = ["BaseOpcode", "Dim"];
}
-foreach intr = !listconcat(AMDGPUImageDimIntrinsics,
- AMDGPUImageDimAtomicIntrinsics) in {
+foreach intr = AMDGPUImageDimIntrinsics in {
def : ImageDimIntrinsicInfo<intr>;
}
+foreach intr = AMDGPUImageDimAtomicIntrinsics in {
+ def : ImageDimAtomicIntrinsicInfo<intr>;
+}
+
// L to LZ Optimization Mapping
def : MIMGLZMapping<IMAGE_SAMPLE_L, IMAGE_SAMPLE_LZ>;
def : MIMGLZMapping<IMAGE_SAMPLE_C_L, IMAGE_SAMPLE_C_LZ>;
@@ -2057,12 +2076,12 @@ class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> :
string AsmOperands = " $vaddr0, $vaddr1"#!if(UpTo2D, "", ", $vaddr2, $vaddr3")#"$r128$cpol";
}
-let SubtargetPredicate = isGFX1250Plus in {
+let SubtargetPredicate = isGFX125xOnly in {
def TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">;
def TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">;
def TENSOR_LOAD_TO_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds", 1>;
def TENSOR_STORE_FROM_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds", 1>;
-} // End SubtargetPredicate = isGFX1250Plus.
+} // End SubtargetPredicate = isGFX125xOnly.
class TensorPat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat <
(node v4i32:$vaddr0, v8i32:$vaddr1, v4i32:$vaddr2, v4i32:$vaddr3, (i32 timm:$cpol)),
@@ -2074,12 +2093,12 @@ class TensorD2Pat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat <
(inst $vaddr0, $vaddr1, 0, $cpol)
>;
-let SubtargetPredicate = isGFX1250Plus in {
+let SubtargetPredicate = isGFX125xOnly in {
def : TensorPat <TENSOR_LOAD_TO_LDS, int_amdgcn_tensor_load_to_lds>;
def : TensorPat <TENSOR_STORE_FROM_LDS, int_amdgcn_tensor_store_from_lds>;
def : TensorD2Pat <TENSOR_LOAD_TO_LDS_D2, int_amdgcn_tensor_load_to_lds_d2>;
def : TensorD2Pat <TENSOR_STORE_FROM_LDS_D2, int_amdgcn_tensor_store_from_lds_d2>;
-}
+} // End SubtargetPredicate = isGFX125xOnly.
class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = ps.Mnemonic> :
InstSI <ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands, []>,
@@ -2097,8 +2116,10 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p
let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
+ // Set VADDR4 to NULL
+ let vaddr4 = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
+
// set to 0 based on SPG.
- let vaddr4 = 0;
let rsrc = 0;
let vdata = 0;
let d16 = 0;
@@ -2109,7 +2130,7 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p
}
multiclass VIMAGE_TENSOR_Real_gfx1250<bits<8> op> {
- let AssemblerPredicate = isGFX1250Plus, DecoderNamespace = "GFX1250" in {
+ let AssemblerPredicate = isGFX125xOnly, DecoderNamespace = "GFX1250" in {
foreach DSuffix = ["_D2", ""] in {
defvar ps = !cast<VIMAGE_TENSOR_Pseudo>(NAME # DSuffix);
def DSuffix # _gfx1250 : VIMAGE_TENSOR_Real<op, ps, ps.Mnemonic>,
diff --git a/llvm/lib/Target/AMDGPU/R600.td b/llvm/lib/Target/AMDGPU/R600.td
index 9148edb..bdfaac9 100644
--- a/llvm/lib/Target/AMDGPU/R600.td
+++ b/llvm/lib/Target/AMDGPU/R600.td
@@ -8,15 +8,6 @@
include "llvm/Target/Target.td"
-def R600InstrInfo : InstrInfo {
- let guessInstructionProperties = 1;
-}
-
-def R600 : Target {
- let InstructionSet = R600InstrInfo;
- let AllowRegisterRenaming = 1;
-}
-
let Namespace = "R600" in {
foreach Index = 0-15 in {
@@ -27,6 +18,18 @@ include "R600RegisterInfo.td"
}
+defm : RemapAllTargetPseudoPointerOperands<R600_Addr>;
+
+def R600InstrInfo : InstrInfo {
+ let guessInstructionProperties = 1;
+}
+
+def R600 : Target {
+ let InstructionSet = R600InstrInfo;
+ let AllowRegisterRenaming = 1;
+}
+
def NullALU : InstrItinClass;
def ALU_NULL : FuncUnit;
diff --git a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 81b142e..248d734 100644
--- a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -68,7 +68,7 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
getLoopDepth() > 1)
return true;
- if (!ST->hasCFAluBug())
+ if (!ST->hasCFALUBug())
return false;
switch(Opcode) {
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 2aa54c9..90c09fe 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -13,6 +13,7 @@
#include "R600ISelLowering.h"
#include "AMDGPU.h"
+#include "AMDGPUSelectionDAGInfo.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
@@ -29,7 +30,8 @@ using namespace llvm;
R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
const R600Subtarget &STI)
- : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) {
+ : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI),
+ Gen(STI.getGeneration()) {
addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass);
addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass);
addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass);
@@ -1129,12 +1131,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) {
// Add an extra level of chain to isolate this vector
SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
- // TODO: can the chain be replaced without creating a new store?
- SDValue NewStore = DAG.getTruncStore(
- NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT,
- StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(),
- StoreNode->getAAInfo());
- StoreNode = cast<StoreSDNode>(NewStore);
+ SmallVector<SDValue, 4> NewOps(StoreNode->ops());
+ NewOps[0] = NewChain;
+ StoreNode = cast<StoreSDNode>(DAG.UpdateNodeOperands(StoreNode, NewOps));
}
return scalarizeVectorStore(StoreNode, DAG);
@@ -1481,6 +1480,9 @@ SDValue R600TargetLowering::LowerFormalArguments(
MemVT = MemVT.getVectorElementType();
}
+ if (VT.isInteger() && !MemVT.isInteger())
+ MemVT = MemVT.changeTypeToInteger();
+
if (AMDGPU::isShader(CallConv)) {
Register Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass);
SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
@@ -1497,11 +1499,15 @@ SDValue R600TargetLowering::LowerFormalArguments(
// thread group and global sizes.
ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
- // FIXME: This should really check the extload type, but the handling of
- // extload vector parameters seems to be broken.
+ if (VT.isFloatingPoint()) {
+ Ext = ISD::EXTLOAD;
+ } else {
+ // FIXME: This should really check the extload type, but the handling of
+ // extload vector parameters seems to be broken.
- // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
- Ext = ISD::SEXTLOAD;
+ // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ Ext = ISD::SEXTLOAD;
+ }
}
// Compute the offset from the value.
@@ -2179,18 +2185,20 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
}
TargetLowering::AtomicExpansionKind
-R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
+R600TargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const {
switch (RMW->getOperation()) {
case AtomicRMWInst::Nand:
case AtomicRMWInst::FAdd:
case AtomicRMWInst::FSub:
case AtomicRMWInst::FMax:
case AtomicRMWInst::FMin:
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat:
return AtomicExpansionKind::CmpXChg;
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
// FIXME: Cayman at least appears to have instructions for this, but the
- // instruction defintions appear to be missing.
+ // instruction definitions appear to be missing.
return AtomicExpansionKind::CmpXChg;
case AtomicRMWInst::Xchg: {
const DataLayout &DL = RMW->getFunction()->getDataLayout();
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
index fc361c01..661efb8 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -116,7 +116,7 @@ private:
SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
TargetLowering::AtomicExpansionKind
- shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const override;
+ shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const override;
};
} // End namespace llvm;
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
index 3e256cc..7f805e6 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -29,7 +29,7 @@ using namespace llvm;
#include "R600GenInstrInfo.inc"
R600InstrInfo::R600InstrInfo(const R600Subtarget &ST)
- : R600GenInstrInfo(ST, -1, -1), RI(), ST(ST) {}
+ : R600GenInstrInfo(ST, RI, -1, -1), RI(), ST(ST) {}
bool R600InstrInfo::isVector(const MachineInstr &MI) const {
return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR;
@@ -176,7 +176,7 @@ bool R600InstrInfo::usesVertexCache(unsigned Opcode) const {
}
bool R600InstrInfo::usesVertexCache(const MachineInstr &MI) const {
- const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineFunction *MF = MI.getMF();
return !AMDGPU::isCompute(MF->getFunction().getCallingConv()) &&
usesVertexCache(MI.getOpcode());
}
@@ -186,7 +186,7 @@ bool R600InstrInfo::usesTextureCache(unsigned Opcode) const {
}
bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const {
- const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineFunction *MF = MI.getMF();
return (AMDGPU::isCompute(MF->getFunction().getCallingConv()) &&
usesVertexCache(MI.getOpcode())) ||
usesTextureCache(MI.getOpcode());
@@ -948,7 +948,7 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI,
.setReg(Pred[2].getReg());
MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_W))
.setReg(Pred[2].getReg());
- MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+ MachineInstrBuilder MIB(*MI.getMF(), MI);
MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit);
return true;
}
@@ -956,7 +956,7 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI,
if (PIdx != -1) {
MachineOperand &PMO = MI.getOperand(PIdx);
PMO.setReg(Pred[2].getReg());
- MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+ MachineInstrBuilder MIB(*MI.getMF(), MI);
MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/llvm/lib/Target/AMDGPU/R600InstrInfo.h
index 68bbac1..b96c17e 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.h
@@ -326,7 +326,7 @@ public:
namespace R600 {
-int getLDSNoRetOp(uint16_t Opcode);
+int64_t getLDSNoRetOp(uint32_t Opcode);
} //End namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td
index dda0cf6..6d7cc8b 100644
--- a/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -736,22 +736,22 @@ def MIN_DX10 : R600_2OP_Helper <0x6, "MIN_DX10", fminnum>;
// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics.
def SETE : R600_2OP <
0x08, "SETE",
- [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OEQ))]
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, fpimm_one, fpimm_zero, COND_OEQ))]
>;
def SGT : R600_2OP <
0x09, "SETGT",
- [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGT))]
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, fpimm_one, fpimm_zero, COND_OGT))]
>;
def SGE : R600_2OP <
0xA, "SETGE",
- [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGE))]
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, fpimm_one, fpimm_zero, COND_OGE))]
>;
def SNE : R600_2OP <
0xB, "SETNE",
- [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))]
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, fpimm_one, fpimm_zero, COND_UNE_NE))]
>;
def SETE_DX10 : R600_2OP <
@@ -1004,19 +1004,19 @@ class FMA_Common <bits<5> inst> : R600_3OP <
class CNDE_Common <bits<5> inst> : R600_3OP <
inst, "CNDE",
- [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))]
+ [(set f32:$dst, (selectcc f32:$src0, fpimm_zero, f32:$src1, f32:$src2, COND_OEQ))]
>;
class CNDGT_Common <bits<5> inst> : R600_3OP <
inst, "CNDGT",
- [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGT))]
+ [(set f32:$dst, (selectcc f32:$src0, fpimm_zero, f32:$src1, f32:$src2, COND_OGT))]
> {
let Itinerary = VecALU;
}
class CNDGE_Common <bits<5> inst> : R600_3OP <
inst, "CNDGE",
- [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGE))]
+ [(set f32:$dst, (selectcc f32:$src0, fpimm_zero, f32:$src1, f32:$src2, COND_OGE))]
> {
let Itinerary = VecALU;
}
diff --git a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp
index 48b4e7f..ac6508c 100644
--- a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp
@@ -55,7 +55,7 @@ void R600AsmPrinter::emitInstruction(const MachineInstr *MI) {
StringRef Err;
if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
- LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext();
+ LLVMContext &C = MI->getMF()->getFunction().getContext();
C.emitError("Illegal instruction detected: " + Err);
MI->print(errs());
}
diff --git a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp
index d9902e1..56d1a19 100644
--- a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp
+++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp
@@ -464,7 +464,7 @@ void R600MachineCFGStructurizer::insertCondBranchBefore(
MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
MBB->insert(I, NewMI);
MachineInstrBuilder MIB(*MF, NewMI);
- MIB.addReg(OldMI->getOperand(1).getReg(), false);
+ MIB.addReg(OldMI->getOperand(1).getReg());
SHOWNEWINSTR(NewMI);
//erase later oldInstr->eraseFromParent();
}
@@ -476,7 +476,7 @@ void R600MachineCFGStructurizer::insertCondBranchBefore(
MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
//insert before
blk->insert(I, NewInstr);
- MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
+ MachineInstrBuilder(*MF, NewInstr).addReg(RegNum);
SHOWNEWINSTR(NewInstr);
}
@@ -1401,7 +1401,7 @@ void R600MachineCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingM
<< LandMBB->getNumber() << "\n";);
MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
assert(BranchMI && isCondBranch(BranchMI));
- DebugLoc DL = BranchMI->getDebugLoc();
+ const DebugLoc &DL = BranchMI->getDebugLoc();
MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI);
MachineBasicBlock::iterator I = BranchMI;
if (TrueBranch != LandMBB)
@@ -1427,7 +1427,7 @@ void R600MachineCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingM
MachineBasicBlock::iterator I = MI;
MachineBasicBlock *TrueBranch = getTrueBranch(MI);
int OldOpcode = MI->getOpcode();
- DebugLoc DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI->getDebugLoc();
bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI);
diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 7f75f27..9e1a97e 100644
--- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -181,7 +181,7 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
Register Reg = RSI->Instr->getOperand(0).getReg();
MachineBasicBlock::iterator Pos = RSI->Instr;
MachineBasicBlock &MBB = *Pos->getParent();
- DebugLoc DL = Pos->getDebugLoc();
+ const DebugLoc &DL = Pos->getDebugLoc();
Register SrcVec = BaseRSI->Instr->getOperand(0).getReg();
DenseMap<Register, unsigned> UpdatedRegToChan = BaseRSI->RegToChan;
@@ -222,8 +222,8 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
// Update RSI
RSI->Instr = NewMI;
- RSI->RegToChan = UpdatedRegToChan;
- RSI->UndefReg = UpdatedUndef;
+ RSI->RegToChan = std::move(UpdatedRegToChan);
+ RSI->UndefReg = std::move(UpdatedUndef);
return NewMI;
}
diff --git a/llvm/lib/Target/AMDGPU/R600Processors.td b/llvm/lib/Target/AMDGPU/R600Processors.td
index 0265a97..dc21eb9 100644
--- a/llvm/lib/Target/AMDGPU/R600Processors.td
+++ b/llvm/lib/Target/AMDGPU/R600Processors.td
@@ -14,7 +14,7 @@ class SubtargetFeatureFetchLimit <string Value> :
>;
def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
- "R600ALUInst",
+ "HasR600ALUInst",
"false",
"Older version of ALU instructions encoding"
>;
@@ -29,37 +29,43 @@ def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
>;
def FeatureCaymanISA : SubtargetFeature<"caymanISA",
- "CaymanISA",
+ "HasCaymanISA",
"true",
"Use Cayman ISA"
>;
def FeatureCFALUBug : SubtargetFeature<"cfalubug",
- "CFALUBug",
+ "HasCFALUBug",
"true",
"GPU has CF_ALU bug"
>;
+def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts",
+ "HasMadMacF32Insts",
+ "true",
+ "Has v_mad_f32/v_mac_f32/v_madak_f32/v_madmk_f32 instructions"
+>;
+
class R600SubtargetFeatureGeneration <string Value, string FeatureName,
list<SubtargetFeature> Implies> :
SubtargetFeatureGeneration <Value, FeatureName, "R600Subtarget", Implies>;
def FeatureR600 : R600SubtargetFeatureGeneration<"R600", "r600",
- [FeatureR600ALUInst, FeatureFetchLimit8]
+ [FeatureR600ALUInst, FeatureFetchLimit8, FeatureMadMacF32Insts]
>;
def FeatureR700 : R600SubtargetFeatureGeneration<"R700", "r700",
- [FeatureFetchLimit16]
+ [FeatureFetchLimit16, FeatureMadMacF32Insts]
>;
def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN", "evergreen",
- [FeatureFetchLimit16, FeatureAddressableLocalMemorySize32768]
+ [FeatureFetchLimit16, FeatureAddressableLocalMemorySize32768, FeatureMadMacF32Insts]
>;
def FeatureNorthernIslands : R600SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
"northern-islands",
[FeatureFetchLimit16, FeatureWavefrontSize64,
- FeatureAddressableLocalMemorySize32768]
+ FeatureAddressableLocalMemorySize32768, FeatureMadMacF32Insts]
>;
diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.h b/llvm/lib/Target/AMDGPU/R600Subtarget.h
index 22e56b6..71398ce 100644
--- a/llvm/lib/Target/AMDGPU/R600Subtarget.h
+++ b/llvm/lib/Target/AMDGPU/R600Subtarget.h
@@ -27,15 +27,14 @@ namespace llvm {
class R600Subtarget final : public R600GenSubtargetInfo,
public AMDGPUSubtarget {
+
+#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
+ bool ATTRIBUTE = DEFAULT;
+#include "R600GenSubtargetInfo.inc"
+
private:
R600InstrInfo InstrInfo;
R600FrameLowering FrameLowering;
- bool FMA = false;
- bool CaymanISA = false;
- bool CFALUBug = false;
- bool HasVertexCache = false;
- bool R600ALUInst = false;
- bool FP64 = false;
short TexVTXClauseSize = 0;
Generation Gen = R600;
R600TargetLowering TLInfo;
@@ -102,9 +101,7 @@ public:
return (getGeneration() >= EVERGREEN);
}
- bool hasCaymanISA() const {
- return CaymanISA;
- }
+ bool hasCaymanISA() const { return HasCaymanISA; }
bool hasFFBL() const {
return (getGeneration() >= EVERGREEN);
@@ -114,9 +111,15 @@ public:
return (getGeneration() >= EVERGREEN);
}
- bool hasFMA() const { return FMA; }
+ bool hasFMA() const override { return HasFMA; }
+
+ bool hasMadMacF32Insts() const override { return HasMadMacF32Insts; }
+
+ bool enablePromoteAlloca() const override { return EnablePromoteAlloca; }
+
+ bool hasFP64() const override { return HasFP64; }
- bool hasCFAluBug() const { return CFALUBug; }
+ bool hasCFALUBug() const { return HasCFALUBug; }
bool hasVertexCache() const { return HasVertexCache; }
diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
index c20487e..4771967 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
@@ -57,9 +57,9 @@ public:
R600CodeGenPassBuilder(R600TargetMachine &TM, const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC);
- void addPreISel(AddIRPass &addPass) const;
- void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
- Error addInstSelector(AddMachinePass &) const;
+ void addPreISel(PassManagerWrapper &PMW) const;
+ void addAsmPrinter(PassManagerWrapper &PMW, CreateMCStreamer) const;
+ Error addInstSelector(PassManagerWrapper &PMW) const;
};
//===----------------------------------------------------------------------===//
@@ -188,16 +188,16 @@ R600CodeGenPassBuilder::R600CodeGenPassBuilder(
Opt.RequiresCodeGenSCCOrder = true;
}
-void R600CodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
+void R600CodeGenPassBuilder::addPreISel(PassManagerWrapper &PMW) const {
// TODO: Add passes pre instruction selection.
}
-void R600CodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass,
+void R600CodeGenPassBuilder::addAsmPrinter(PassManagerWrapper &PMW,
CreateMCStreamer) const {
// TODO: Add AsmPrinter.
}
-Error R600CodeGenPassBuilder::addInstSelector(AddMachinePass &) const {
+Error R600CodeGenPassBuilder::addInstSelector(PassManagerWrapper &PMW) const {
// TODO: Add instruction selector.
return Error::success();
}
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
index 3093227..c08edc1 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
@@ -108,19 +108,17 @@ InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode,
}
}
-InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
- TTI::TargetCostKind CostKind,
- unsigned Index,
- const Value *Op0,
- const Value *Op1) const {
+InstructionCost R600TTIImpl::getVectorInstrCost(
+ unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
+ const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
switch (Opcode) {
case Instruction::ExtractElement:
case Instruction::InsertElement: {
unsigned EltSize =
DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
if (EltSize < 32) {
- return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
- Op1);
+ return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
+ VIC);
}
// Extracts are just reads of a subregister, so are free. Inserts are
@@ -131,7 +129,8 @@ InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
return Index == ~0u ? 2 : 0;
}
default:
- return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
+ return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
+ VIC);
}
}
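
A minimal C++ sketch of the cost rule above, written as a standalone helper (the helper name and the ~0u sentinel are illustrative, not in-tree API):

    // Elements narrower than 32 bits defer to the generic base cost;
    // otherwise a known lane index is free, and only a dynamic index
    // (passed as ~0u) pays for the indirect access.
    static unsigned r600VectorEltCost(unsigned EltSizeBits, unsigned Index) {
      if (EltSizeBits < 32)
        return ~0u; // sentinel: the real code calls BaseT::getVectorInstrCost
      return Index == ~0u ? 2 : 0;
    }
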
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
index 3deae69..ade1b15 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
@@ -62,10 +62,11 @@ public:
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) const override;
using BaseT::getVectorInstrCost;
- InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
- const Value *Op1) const override;
+ InstructionCost
+ getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind,
+ unsigned Index, const Value *Op0, const Value *Op1,
+ TTI::VectorInstrContext VIC =
+ TTI::VectorInstrContext::None) const override;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index ecc2824..0c7c642 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -46,6 +46,7 @@ enum {
GFX11 = 10,
GFX12 = 11,
GFX1250 = 12,
+ GFX13 = 13,
};
}
@@ -207,6 +208,7 @@ enum OperandType : unsigned {
OPERAND_REG_IMM_FP16,
OPERAND_REG_IMM_V2BF16,
OPERAND_REG_IMM_V2FP16,
+ OPERAND_REG_IMM_V2FP16_SPLAT,
OPERAND_REG_IMM_V2INT16,
OPERAND_REG_IMM_NOINLINE_V2FP16,
OPERAND_REG_IMM_V2INT32,
@@ -423,6 +425,9 @@ enum CPol {
// Volatile (used to preserve/signal operation volatility for buffer
// operations, not a real instruction bit)
VOLATILE = 1 << 31,
+ // The set of "cache policy" bits used for compiler features that
+ // do not correspond to hardware features.
+ VIRTUAL_BITS = VOLATILE,
};
} // namespace CPol
@@ -445,7 +450,6 @@ enum Id { // Message ID, width(4) [3:0].
ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10
ID_GS_ALLOC_REQ = 9, // added in GFX9
ID_GET_DOORBELL = 10, // added in GFX9, removed in GFX11
- ID_SAVEWAVE_HAS_TDM = 10, // added in GFX1250
ID_GET_DDID = 11, // added in GFX10, removed in GFX11
ID_SYSMSG = 15,
@@ -459,6 +463,7 @@ enum Id { // Message ID, width(4) [3:0].
ID_RTN_GET_SE_AID_ID = 135,
ID_RTN_GET_CLUSTER_BARRIER_STATE = 136, // added in GFX1250
+ ID_RTN_SAVE_WAVE_HAS_TDM = 152, // added in GFX1250
ID_MASK_PreGFX11_ = 0xF,
ID_MASK_GFX11Plus_ = 0xFF
@@ -496,6 +501,14 @@ enum StreamId : unsigned { // Stream ID, (2) [9:8].
} // namespace SendMsg
+namespace WaitEvent { // Encoding of SIMM16 used in s_wait_event
+enum Id {
+ DONT_WAIT_EXPORT_READY = 1 << 0, // Only used in gfx11
+ EXPORT_READY = 1 << 1, // gfx12+
+};
+
+} // namespace WaitEvent
+
namespace Hwreg { // Encoding of SIMM16 used in s_setreg/getreg* insns.
enum Id { // HwRegCode, (6) [5:0]
@@ -520,6 +533,7 @@ enum Id { // HwRegCode, (6) [5:0]
ID_HW_ID1 = 23,
ID_HW_ID2 = 24,
ID_POPS_PACKER = 25,
+ ID_SCHED_MODE = 26,
ID_PERF_SNAPSHOT_DATA_gfx11 = 27,
ID_IB_STS2 = 28,
ID_SHADER_CYCLES = 29,
@@ -578,11 +592,11 @@ enum ModeRegisterMasks : uint32_t {
CSP_MASK = 0x7u << 29, // Bits 29..31
// GFX1250
- DST_VGPR_MSB = 1 << 12,
- SRC0_VGPR_MSB = 1 << 13,
- SRC1_VGPR_MSB = 1 << 14,
- SRC2_VGPR_MSB = 1 << 15,
- VGPR_MSB_MASK = 0xf << 12, // Bits 12..15
+ DST_VGPR_MSB = 0x3 << 12,
+ SRC0_VGPR_MSB = 0x3 << 14,
+ SRC1_VGPR_MSB = 0x3 << 16,
+ SRC2_VGPR_MSB = 0x3 << 18,
+ VGPR_MSB_MASK = 0xff << 12, // Bits 12..19
REPLAY_MODE = 1 << 25,
FLAT_SCRATCH_IS_NV = 1 << 26,
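
A hedged sketch of the widened MSB layout above (helper names are hypothetical): each of dst/src0/src1/src2 now owns a two-bit field packed at bits 12..19 of the mode word, so the field for operand N sits at bit 12 + 2*N.

    #include <cstdint>

    // OpIdx: 0 = dst, 1..3 = src0..src2.
    static unsigned getVgprMsb(uint32_t Mode, unsigned OpIdx) {
      return (Mode >> (12 + 2 * OpIdx)) & 0x3;
    }
    static uint32_t setVgprMsb(uint32_t Mode, unsigned OpIdx, unsigned Msb) {
      unsigned Shift = 12 + 2 * OpIdx;
      return (Mode & ~(0x3u << Shift)) | ((Msb & 0x3u) << Shift);
    }
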
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 7793907..8782fc5 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -111,7 +111,7 @@ public:
V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width)
: Copy(C), NumReadfirstlanes(Width / 32), ID(Id){};
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void dump() {
+ void dump() const {
dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
<< "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
<< "\nScore: " << Score << "\n";
@@ -238,7 +238,7 @@ static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
const SIRegisterInfo *TRI,
const SIInstrInfo *TII) {
- MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
auto &Src = MI.getOperand(1);
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = Src.getReg();
@@ -856,8 +856,8 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
}
}
- if (TRI->isVectorRegister(*MRI, PHIRes) ||
- RC0 == &AMDGPU::VReg_1RegClass) {
+ if (TRI->hasVectorRegisters(MRI->getRegClass(PHIRes)) ||
+ RC0 == &AMDGPU::VReg_1RegClass) {
LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
TII->legalizeOperands(MI, MDT);
}
@@ -902,14 +902,28 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
// really much we can do to fix this.
// Some special instructions use M0 as an input. Some even only use
// the first lane. Insert a readfirstlane and hope for the best.
- if (DstReg == AMDGPU::M0 &&
- TRI->hasVectorRegisters(MRI->getRegClass(SrcReg))) {
+ const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
+ if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
Register TmpReg =
MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
- TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
+
+ const MCInstrDesc &ReadFirstLaneDesc =
+ TII->get(AMDGPU::V_READFIRSTLANE_B32);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), ReadFirstLaneDesc, TmpReg)
.add(MI.getOperand(1));
+
+ unsigned SubReg = MI.getOperand(1).getSubReg();
MI.getOperand(1).setReg(TmpReg);
+ MI.getOperand(1).setSubReg(AMDGPU::NoSubRegister);
+
+ const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1);
+ const TargetRegisterClass *ConstrainRC =
+ SubReg == AMDGPU::NoSubRegister
+ ? OpRC
+ : TRI->getMatchingSuperRegClass(SrcRC, OpRC, SubReg);
+
+ if (!MRI->constrainRegClass(SrcReg, ConstrainRC))
+ llvm_unreachable("failed to constrain register");
} else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(),
MI, MI.getDebugLoc())) {
I = std::next(I);
@@ -930,7 +944,7 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
// s_mov_b32.
if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) {
MI.getOperand(1).ChangeToImmediate(Imm);
- MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
+ MI.addImplicitDefUseOperands(*MI.getMF());
MI.setDesc(TII->get(SMovOp));
return true;
}
@@ -999,7 +1013,7 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
AnalysisWorklist.push_back(U);
}
}
- V2SCopies[Info.ID] = Info;
+ V2SCopies[Info.ID] = std::move(Info);
}
// The main function that computes the VGPR to SGPR copy score
@@ -1058,7 +1072,7 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
unsigned CurID = LoweringWorklist.pop_back_val();
auto *CurInfoIt = V2SCopies.find(CurID);
if (CurInfoIt != V2SCopies.end()) {
- V2SCopyInfo C = CurInfoIt->second;
+ const V2SCopyInfo &C = CurInfoIt->second;
LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
for (auto S : C.Siblings) {
auto *SibInfoIt = V2SCopies.find(S);
@@ -1075,10 +1089,10 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
}
LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
<< " is being turned to VALU\n");
+ Copies.insert(C.Copy);
// TODO: MapVector::erase is inefficient. Do bulk removal with remove_if
// instead.
V2SCopies.erase(C.ID);
- Copies.insert(C.Copy);
}
}
@@ -1115,16 +1129,27 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), VReg32)
- .addReg(SrcReg, 0, SubReg)
+ .addReg(SrcReg, {}, SubReg)
.addImm(AMDGPU::lo16)
.addReg(Undef)
.addImm(AMDGPU::hi16);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
.addReg(VReg32);
} else if (SrcSize == 32) {
- auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
- MIB.addReg(SrcReg, 0, SubReg);
+ const MCInstrDesc &ReadFirstLaneDesc =
+ TII->get(AMDGPU::V_READFIRSTLANE_B32);
+ const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), ReadFirstLaneDesc, DstReg)
+ .addReg(SrcReg, {}, SubReg);
+
+ const TargetRegisterClass *ConstrainRC =
+ SubReg == AMDGPU::NoSubRegister
+ ? OpRC
+ : TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), OpRC,
+ SubReg);
+
+ if (!MRI->constrainRegClass(SrcReg, ConstrainRC))
+ llvm_unreachable("failed to constrain register");
} else {
auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::REG_SEQUENCE), DstReg);
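
A sketch of the register-class constraint step added above, pulled out as a hypothetical helper: with no subregister the operand's class applies directly, while a subregister read requires constraining SrcReg to a super-class whose SubReg projection fits the operand's class.

    static const TargetRegisterClass *
    pickConstraintRC(const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI,
                     Register SrcReg, unsigned SubReg,
                     const TargetRegisterClass *OpRC) {
      if (SubReg == AMDGPU::NoSubRegister)
        return OpRC;
      return TRI.getMatchingSuperRegClass(MRI.getRegClass(SrcReg), OpRC, SubReg);
    }
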
diff --git a/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
index d0d6792..b368e20 100644
--- a/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
@@ -27,9 +27,7 @@ class SIFixVGPRCopiesLegacy : public MachineFunctionPass {
public:
static char ID;
- SIFixVGPRCopiesLegacy() : MachineFunctionPass(ID) {
- initializeSIFixVGPRCopiesLegacyPass(*PassRegistry::getPassRegistry());
- }
+ SIFixVGPRCopiesLegacy() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 6616b30..a2fe31b 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -187,7 +187,7 @@ public:
unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
switch (Opc) {
case AMDGPU::S_ADD_I32: {
- if (ST->hasAddNoCarry())
+ if (ST->hasAddNoCarryInsts())
return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
}
@@ -242,7 +242,6 @@ public:
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
- std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
bool tryConstantFoldOp(MachineInstr *MI) const;
bool tryFoldCndMask(MachineInstr &MI) const;
bool tryFoldZeroHighBits(MachineInstr &MI) const;
@@ -681,6 +680,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
return false;
MI->setDesc(TII->get(NewMFMAOpc));
MI->untieRegOperand(0);
+ const MCInstrDesc &MCID = MI->getDesc();
+ for (unsigned I = 0; I < MI->getNumDefs(); ++I)
+ if (MCID.getOperandConstraint(I, MCOI::EARLY_CLOBBER) != -1)
+ MI->getOperand(I).setIsEarlyClobber(true);
}
// TODO: Should we try to avoid adding this to the candidate list?
@@ -709,7 +712,7 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
// Verify the register is compatible with the operand.
if (const TargetRegisterClass *OpRC =
- TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) {
+ TII->getRegClass(MI->getDesc(), Fold.UseOpNo)) {
const TargetRegisterClass *NewRC =
TRI->getRegClassForReg(*MRI, New->getReg());
@@ -762,6 +765,29 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
}
+// Returns true if the instruction is a packed F32 instruction whose
+// corresponding scalar operand is read as 32 bits and replicated to both
+// channels.
+static bool isPKF32InstrReplicatesLower32BitsOfScalarOperand(
+ const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo) {
+ if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput())
+ return false;
+ const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo];
+ return OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
+}
+
+// Packed FP32 instructions read only 32 bits from a scalar operand (SGPR or
+// literal) and replicate those bits to both channels. Therefore, if the hi
+// and lo halves differ, we can't fold the immediate.
+static bool checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(
+ const FoldableDef &OpToFold) {
+ assert(OpToFold.isImm() && "Expected immediate operand");
+ uint64_t ImmVal = OpToFold.getEffectiveImmVal().value();
+ uint32_t Lo = Lo_32(ImmVal);
+ uint32_t Hi = Hi_32(ImmVal);
+ return Lo == Hi;
+}
+
bool SIFoldOperandsImpl::tryAddToFoldList(
SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
const FoldableDef &OpToFold) const {
@@ -915,6 +941,13 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
return true;
}
+ // Special case for PK_F32 instructions if we are trying to fold an imm to
+ // src0 or src1.
+ if (OpToFold.isImm() &&
+ isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, MI, OpNo) &&
+ !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold))
+ return false;
+
appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
return true;
}
@@ -1129,40 +1162,14 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
return false;
- MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
+ if (isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, UseMI, UseOpIdx) &&
+ !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold))
+ return false;
appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
return true;
}
- // TODO: Verify the following code handles subregisters correctly.
- // TODO: Handle extract of global reference
- if (UseOp.getSubReg())
- return false;
-
- if (!OpToFold.isReg())
- return false;
-
- Register UseReg = OpToFold.getReg();
- if (!UseReg.isVirtual())
- return false;
-
- // Maybe it is just a COPY of an immediate itself.
-
- // FIXME: Remove this handling. There is already special case folding of
- // immediate into copy in foldOperand. This is looking for the def of the
- // value the folding started from in the first place.
- MachineInstr *Def = MRI->getVRegDef(UseReg);
- if (Def && TII->isFoldableCopy(*Def)) {
- MachineOperand &DefOp = Def->getOperand(1);
- if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
- FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC,
- OpToFold.DefSubReg);
- appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm);
- return true;
- }
- }
-
return false;
}
@@ -1309,10 +1316,11 @@ void SIFoldOperandsImpl::foldOperand(
continue;
const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
- const TargetRegisterClass *MovSrcRC =
- TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[SrcIdx]));
- if (MovSrcRC) {
+ int16_t RegClassID = TII->getOpRegClassID(MovDesc.operands()[SrcIdx]);
+ if (RegClassID != -1) {
+ const TargetRegisterClass *MovSrcRC = TRI->getRegClass(RegClassID);
+
if (UseSubReg)
MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
@@ -1351,7 +1359,7 @@ void SIFoldOperandsImpl::foldOperand(
if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
const auto &SrcOp = UseMI->getOperand(UseOpIdx);
MachineOperand NewSrcOp(SrcOp);
- MachineFunction *MF = UseMI->getParent()->getParent();
+ MachineFunction *MF = UseMI->getMF();
UseMI->removeOperand(1);
UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
UseMI->addOperand(NewSrcOp); // src0
@@ -1382,7 +1390,7 @@ void SIFoldOperandsImpl::foldOperand(
// Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
// VS_16RegClass
//
- // Excerpt from AMDGPUGenRegisterInfo.inc
+ // Excerpt from AMDGPUGenRegisterInfoEnums.inc
// NoSubRegister, //0
// hi16, // 1
// lo16, // 2
@@ -1437,6 +1445,7 @@ void SIFoldOperandsImpl::foldOperand(
return;
UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
+ UseMI->clearFlag(MachineInstr::NoConvergent);
if (OpToFold.isImm()) {
UseMI->getOperand(1).ChangeToImmediate(
@@ -1468,6 +1477,7 @@ void SIFoldOperandsImpl::foldOperand(
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
UseMI->getOperand(1).setIsKill(false);
UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
+ UseMI->clearFlag(MachineInstr::NoConvergent);
return;
}
}
@@ -1558,38 +1568,6 @@ static unsigned getMovOpc(bool IsScalar) {
return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}
-static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
- MI.setDesc(NewDesc);
-
- // Remove any leftover implicit operands from mutating the instruction. e.g.
- // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
- // anymore.
- const MCInstrDesc &Desc = MI.getDesc();
- unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
- Desc.implicit_defs().size();
-
- for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
- MI.removeOperand(I);
-}
-
-std::optional<int64_t>
-SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const {
- if (Op.isImm())
- return Op.getImm();
-
- if (!Op.isReg() || !Op.getReg().isVirtual())
- return std::nullopt;
-
- const MachineInstr *Def = MRI->getVRegDef(Op.getReg());
- if (Def && Def->isMoveImmediate()) {
- const MachineOperand &ImmSrc = Def->getOperand(1);
- if (ImmSrc.isImm())
- return TII->extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
- }
-
- return std::nullopt;
-}
-
// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
@@ -1604,13 +1582,14 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
return false;
MachineOperand *Src0 = &MI->getOperand(Src0Idx);
- std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
+ std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0);
if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
Opc == AMDGPU::S_NOT_B32) &&
Src0Imm) {
MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
- mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
+ TII->mutateAndCleanupImplicit(
+ *MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
return true;
}
@@ -1619,7 +1598,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
return false;
MachineOperand *Src1 = &MI->getOperand(Src1Idx);
- std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
+ std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1);
if (!Src0Imm && !Src1Imm)
return false;
@@ -1638,7 +1617,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
// instruction.
MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
MI->removeOperand(Src1Idx);
- mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
+ TII->mutateAndCleanupImplicit(*MI, TII->get(getMovOpc(IsSGPR)));
return true;
}
@@ -1658,11 +1637,12 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
if (Src1Val == 0) {
// y = or x, 0 => y = copy x
MI->removeOperand(Src1Idx);
- mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+ TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
} else if (Src1Val == -1) {
// y = or x, -1 => y = v_mov_b32 -1
MI->removeOperand(Src1Idx);
- mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
+ TII->mutateAndCleanupImplicit(
+ *MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
} else
return false;
@@ -1674,11 +1654,12 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
if (Src1Val == 0) {
// y = and x, 0 => y = v_mov_b32 0
MI->removeOperand(Src0Idx);
- mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
+ TII->mutateAndCleanupImplicit(
+ *MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
} else if (Src1Val == -1) {
// y = and x, -1 => y = copy x
MI->removeOperand(Src1Idx);
- mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+ TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
} else
return false;
@@ -1690,7 +1671,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
if (Src1Val == 0) {
// y = xor x, 0 => y = copy x
MI->removeOperand(Src1Idx);
- mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+ TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
return true;
}
}
@@ -1708,11 +1689,11 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
if (!Src1->isIdenticalTo(*Src0)) {
- std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
+ std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1);
if (!Src1Imm)
return false;
- std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
+ std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0);
if (!Src0Imm || *Src0Imm != *Src1Imm)
return false;
}
@@ -1736,7 +1717,7 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
MI.removeOperand(Src1ModIdx);
if (Src0ModIdx != -1)
MI.removeOperand(Src0ModIdx);
- mutateCopyOp(MI, NewDesc);
+ TII->mutateAndCleanupImplicit(MI, NewDesc);
LLVM_DEBUG(dbgs() << MI);
return true;
}
@@ -1746,7 +1727,8 @@ bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
MI.getOpcode() != AMDGPU::V_AND_B32_e32)
return false;
- std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1));
+ std::optional<int64_t> Src0Imm =
+ TII->getImmOrMaterializedImm(MI.getOperand(1));
if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
return false;
@@ -1804,7 +1786,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
if (CopiesToReplace.empty() && FoldList.empty())
return Changed;
- MachineFunction *MF = MI.getParent()->getParent();
+ MachineFunction *MF = MI.getMF();
// Make sure we add EXEC uses to any new v_mov instructions created.
for (MachineInstr *Copy : CopiesToReplace)
Copy->addImplicitDefUseOperands(*MF);
@@ -2419,7 +2401,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
unsigned OpIdx = Op - &UseMI->getOperand(0);
const MCInstrDesc &InstDesc = UseMI->getDesc();
- const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI);
+ const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx);
if (!OpRC || !TRI->isVectorSuperClass(OpRC))
return false;
@@ -2435,7 +2417,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
} else { // This is a copy
MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
SubDef->getOperand(1).setIsKill(false);
- RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
+ RS.addReg(SubDef->getOperand(1).getReg(), {}, Def->getSubReg());
}
RS.addImm(SubIdx);
}
@@ -2759,7 +2741,7 @@ bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
MachineInstr *VGPRCopy =
BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
- .addReg(Reg, /* flags */ 0, SubReg);
+ .addReg(Reg, /* flags */ {}, SubReg);
// Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
Register TempAGPR = MRI->createVirtualRegister(ARC);
@@ -2793,7 +2775,6 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
//
// FIXME: Also need to check strictfp
bool IsIEEEMode = MFI->getMode().IEEE;
- bool HasNSZ = MFI->hasNoSignedZerosFPMath();
bool Changed = false;
for (MachineBasicBlock *MBB : depth_first(&MF)) {
@@ -2832,8 +2813,7 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
// TODO: Omod might be OK if there is NSZ only on the source
// instruction, and not the omod multiply.
- if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
- !tryFoldOMod(MI))
+ if (IsIEEEMode || !MI.getFlag(MachineInstr::FmNsz) || !tryFoldOMod(MI))
Changed |= tryFoldClamp(MI);
}
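
A worked example of the lo/hi check above (constants illustrative): a packed-F32 scalar source replicates its low 32 bits into both channels, so only a 64-bit immediate whose halves match may be folded.

    uint64_t Splat = 0x3F8000003F800000ULL; // <1.0f, 1.0f>
    uint64_t Mixed = 0x400000003F800000ULL; // <1.0f, 2.0f>
    assert(Lo_32(Splat) == Hi_32(Splat));   // halves agree: foldable
    assert(Lo_32(Mixed) != Hi_32(Mixed));   // halves differ: keep in a register
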
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index 6b13b06..9820341 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -33,7 +33,7 @@ MaxClause("amdgpu-max-memory-clause", cl::Hidden, cl::init(15),
namespace {
class SIFormMemoryClausesImpl {
- using RegUse = DenseMap<unsigned, std::pair<unsigned, LaneBitmask>>;
+ using RegUse = DenseMap<unsigned, std::pair<RegState, LaneBitmask>>;
bool canBundle(const MachineInstr &MI, const RegUse &Defs,
const RegUse &Uses) const;
@@ -61,9 +61,7 @@ class SIFormMemoryClausesLegacy : public MachineFunctionPass {
public:
static char ID;
- SIFormMemoryClausesLegacy() : MachineFunctionPass(ID) {
- initializeSIFormMemoryClausesLegacyPass(*PassRegistry::getPassRegistry());
- }
+ SIFormMemoryClausesLegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -132,8 +130,8 @@ static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) {
return true;
}
-static unsigned getMopState(const MachineOperand &MO) {
- unsigned S = 0;
+static RegState getMopState(const MachineOperand &MO) {
+ RegState S = {};
if (MO.isImplicit())
S |= RegState::Implicit;
if (MO.isDead())
@@ -234,7 +232,7 @@ void SIFormMemoryClausesImpl::collectRegUses(const MachineInstr &MI,
: LaneBitmask::getAll();
RegUse &Map = MO.isDef() ? Defs : Uses;
- unsigned State = getMopState(MO);
+ RegState State = getMopState(MO);
auto [Loc, Inserted] = Map.try_emplace(Reg, State, Mask);
if (!Inserted) {
Loc->second.first |= State;
@@ -349,7 +347,7 @@ bool SIFormMemoryClausesImpl::run(MachineFunction &MF) {
continue;
// Collect the register operands we should extend the live ranges of.
- SmallVector<std::tuple<unsigned, unsigned>> KillOps;
+ SmallVector<std::tuple<RegState, unsigned>> KillOps;
const LiveInterval &LI = LIS->getInterval(R.first);
if (!LI.hasSubRanges()) {
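
A small sketch of what the unsigned-to-RegState switch above buys, assuming RegState is the bitmask-enum type the hunk uses: flag accumulation still works through |=, but silently mixing in untyped integers no longer type-checks.

    RegState S = {};
    S |= RegState::Implicit;
    S |= RegState::Kill;
    // unsigned Raw = S; // would now need an explicit conversion
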
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0189e7b..a0952b3 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -139,8 +139,8 @@ static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
MachineBasicBlock::iterator I, const DebugLoc &DL,
Register SpillReg, int FI, Register FrameReg,
int64_t DwordOff = 0) {
- unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
- : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
+ unsigned Opc = ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
+ : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
@@ -163,8 +163,8 @@ static void buildEpilogRestore(const GCNSubtarget &ST,
MachineBasicBlock::iterator I,
const DebugLoc &DL, Register SpillReg, int FI,
Register FrameReg, int64_t DwordOff = 0) {
- unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
- : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
+ unsigned Opc = ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
+ : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
@@ -591,7 +591,7 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
}
static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
- return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
+ return ST.hasFlatScratchEnabled() ? 1 : ST.getWavefrontSize();
}
void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
@@ -629,7 +629,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
// This will return `Register()` in cases where there are no actual
// uses of the SRSRC.
Register ScratchRsrcReg;
- if (!ST.enableFlatScratch())
+ if (!ST.hasFlatScratchEnabled())
ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
// Make the selected register live throughout the function.
@@ -724,14 +724,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
FrameInfo.getMaxAlign());
MFI->setScratchReservedForDynamicVGPRs(VGPRSize);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg)
- .addImm(AMDGPU::Hwreg::HwregEncoding::encode(
- AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2));
- // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
- // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
- // SCC, so we need to check for 0 manually.
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(FPReg);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), FPReg).addImm(VGPRSize);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::GET_STACK_BASE), FPReg);
if (requiresStackPointerReference(MF)) {
Register SPReg = MFI->getStackPtrOffsetReg();
assert(SPReg != AMDGPU::SP_REG);
@@ -755,10 +748,10 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
bool NeedsFlatScratchInit =
MFI->getUserSGPRInfo().hasFlatScratchInit() &&
(MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
- (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
+ (!allStackObjectsAreDead(FrameInfo) && ST.hasFlatScratchEnabled()));
if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
- PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
+ PreloadedScratchWaveOffsetReg && !ST.hasArchitectedFlatScratch()) {
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
}
@@ -772,6 +765,17 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
PreloadedScratchRsrcReg,
ScratchRsrcReg, ScratchWaveOffsetReg);
}
+
+ if (ST.hasWaitXcnt()) {
+ // Set REPLAY_MODE (bit 25) in MODE register to enable multi-group XNACK
+ // replay. This aligns hardware behavior with the compiler's s_wait_xcnt
+ // insertion logic, which assumes multi-group mode by default.
+ unsigned RegEncoding =
+ AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 25, 1);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+ .addImm(1)
+ .addImm(RegEncoding);
+ }
}
// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
@@ -1034,16 +1038,13 @@ void SIFrameLowering::emitCSRSpillStores(
StoreWWMRegisters(WWMCalleeSavedRegs);
if (FuncInfo->isWholeWaveFunction()) {
- // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose, so we can remove
- // it now. If we have already saved some WWM CSR registers, then the EXEC is
- // already -1 and we don't need to do anything else. Otherwise, set EXEC to
- // -1 here.
+ // If we have already saved some WWM CSR registers, then the EXEC is already
+ // -1 and we don't need to do anything else. Otherwise, set EXEC to -1 here.
if (!ScratchExecCopy)
buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true,
/*EnableInactiveLanes*/ true);
else if (WWMCalleeSavedRegs.empty())
EnableAllLanes();
- TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
} else if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
@@ -1340,6 +1341,11 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
"Needed to save BP but didn't save it anywhere");
assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
+
+ if (FuncInfo->isWholeWaveFunction()) {
+ // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose.
+ TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
+ }
}
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -1831,9 +1837,7 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
const GCNSubtarget &ST,
- std::vector<CalleeSavedInfo> &CSI,
- unsigned &MinCSFrameIndex,
- unsigned &MaxCSFrameIndex) {
+ std::vector<CalleeSavedInfo> &CSI) {
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &MFI = MF.getFrameInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -1902,10 +1906,7 @@ static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
int FrameIdx =
MFI.CreateStackObject(BlockSize, TRI->getSpillAlign(*BlockRegClass),
/*isSpillSlot=*/true);
- if ((unsigned)FrameIdx < MinCSFrameIndex)
- MinCSFrameIndex = FrameIdx;
- if ((unsigned)FrameIdx > MaxCSFrameIndex)
- MaxCSFrameIndex = FrameIdx;
+ MFI.setIsCalleeSavedObjectIndex(FrameIdx, true);
CSIt->setFrameIdx(FrameIdx);
CSIt->setReg(RegBlock);
@@ -1915,8 +1916,7 @@ static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
bool SIFrameLowering::assignCalleeSavedSpillSlots(
MachineFunction &MF, const TargetRegisterInfo *TRI,
- std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
- unsigned &MaxCSFrameIndex) const {
+ std::vector<CalleeSavedInfo> &CSI) const {
if (CSI.empty())
return true; // Early exit if no callee saved registers are modified!
@@ -1924,12 +1924,12 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots(
bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR();
if (UseVGPRBlocks)
- assignSlotsUsingVGPRBlocks(MF, ST, CSI, MinCSFrameIndex, MaxCSFrameIndex);
+ assignSlotsUsingVGPRBlocks(MF, ST, CSI);
- return assignCalleeSavedSpillSlots(MF, TRI, CSI) || UseVGPRBlocks;
+ return assignCalleeSavedSpillSlotsImpl(MF, TRI, CSI) || UseVGPRBlocks;
}
-bool SIFrameLowering::assignCalleeSavedSpillSlots(
+bool SIFrameLowering::assignCalleeSavedSpillSlotsImpl(
MachineFunction &MF, const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI) const {
if (CSI.empty())
@@ -1986,7 +1986,7 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
// TODO: We could try sorting the objects to find a hole in the first bytes
// rather than allocating as close to possible. This could save a lot of space
// on frames with alignment requirements.
- if (ST.enableFlatScratch()) {
+ if (ST.hasFlatScratchEnabled()) {
if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
SIInstrFlags::FlatScratch))
return false;
@@ -2168,7 +2168,9 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
return MFI.getStackSize() != 0;
}
- return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
+ return (frameTriviallyRequiresSP(MFI) &&
+ !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) ||
+ MFI.isFrameAddressTaken() ||
MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
MF) ||
mayReserveScratchForCWSR(MF) ||
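
A hedged sketch of the SIMM16 the s_setreg in the prologue change encodes, assuming the usual hwreg packing (id in bits [5:0], bit offset in [10:6], width-1 in [15:11]); the in-tree HwregEncoding::encode helper hides this arithmetic.

    static unsigned encodeHwreg(unsigned Id, unsigned Offset, unsigned Width) {
      return Id | (Offset << 6) | ((Width - 1) << 11);
    }
    // MODE is hwreg id 1; offset 25, width 1 selects the REPLAY_MODE bit.
    unsigned ReplaySimm16 = encodeHwreg(/*Id=*/1, /*Offset=*/25, /*Width=*/1);
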
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index a727729..4c1cf3c 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -49,11 +49,9 @@ public:
const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI) const override;
- bool assignCalleeSavedSpillSlots(MachineFunction &MF,
- const TargetRegisterInfo *TRI,
- std::vector<CalleeSavedInfo> &CSI,
- unsigned &MinCSFrameIndex,
- unsigned &MaxCSFrameIndex) const override;
+ bool assignCalleeSavedSpillSlotsImpl(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a2841c11..fe1d24f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPULaneMaskUtils.h"
+#include "AMDGPUSelectionDAGInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -34,6 +35,8 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePassManager.h"
+#include "llvm/CodeGen/PseudoSourceValueManager.h"
#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
@@ -86,69 +89,78 @@ static unsigned findFirstFreeSGPR(CCState &CCInfo) {
SITargetLowering::SITargetLowering(const TargetMachine &TM,
const GCNSubtarget &STI)
- : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
+ : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
- addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
+
+ const SIRegisterInfo *TRI = STI.getRegisterInfo();
+ const TargetRegisterClass *V32RegClass =
+ TRI->getDefaultVectorSuperClassForBitWidth(32);
+ addRegisterClass(MVT::f32, V32RegClass);
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
- const SIRegisterInfo *TRI = STI.getRegisterInfo();
- const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
+ const TargetRegisterClass *V64RegClass =
+ TRI->getDefaultVectorSuperClassForBitWidth(64);
addRegisterClass(MVT::f64, V64RegClass);
addRegisterClass(MVT::v2f32, V64RegClass);
addRegisterClass(MVT::Untyped, V64RegClass);
addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
- addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
+ addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
- addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
+ addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
- addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
+ addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
- addRegisterClass(MVT::v6f32, &AMDGPU::VReg_192RegClass);
+ addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
- addRegisterClass(MVT::v3f64, &AMDGPU::VReg_192RegClass);
+ addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
- addRegisterClass(MVT::v7f32, &AMDGPU::VReg_224RegClass);
+ addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
+ addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass);
+ addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
- addRegisterClass(MVT::v9f32, &AMDGPU::VReg_288RegClass);
+ addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
- addRegisterClass(MVT::v10f32, &AMDGPU::VReg_320RegClass);
+ addRegisterClass(MVT::v10f32,
+ TRI->getDefaultVectorSuperClassForBitWidth(320));
addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
- addRegisterClass(MVT::v11f32, &AMDGPU::VReg_352RegClass);
+ addRegisterClass(MVT::v11f32,
+ TRI->getDefaultVectorSuperClassForBitWidth(352));
addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
- addRegisterClass(MVT::v12f32, &AMDGPU::VReg_384RegClass);
+ addRegisterClass(MVT::v12f32,
+ TRI->getDefaultVectorSuperClassForBitWidth(384));
addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v16f32,
+ TRI->getDefaultVectorSuperClassForBitWidth(512));
addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
- addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v16f64,
+ TRI->getDefaultVectorSuperClassForBitWidth(1024));
if (Subtarget->has16BitInsts()) {
if (Subtarget->useRealTrue16Insts()) {
@@ -180,7 +192,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
- addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v32f32,
+ TRI->getDefaultVectorSuperClassForBitWidth(1024));
computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -217,9 +230,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
ISD::SETCC}) {
- // FIXME: The promoted to type shouldn't need to be explicit
setOperationAction(Opc, MVT::bf16, Promote);
- AddPromotedToType(Opc, MVT::bf16, MVT::f32);
}
setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand);
@@ -263,6 +274,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
+ setOperationAction(ISD::ExternalSymbol, {MVT::i32, MVT::i64}, Custom);
setOperationAction(ISD::SELECT, MVT::i1, Promote);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
@@ -298,7 +310,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BR_CC,
{MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
- setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
+ setOperationAction({ISD::ABS, ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal);
@@ -492,6 +504,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->has16BitInsts()) {
setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
+ setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
+ setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal);
+ setOperationAction(ISD::FCANONICALIZE, MVT::f16, Legal);
} else {
setOperationAction(ISD::FSQRT, MVT::f16, Custom);
}
@@ -499,21 +514,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasMadMacF32Insts())
setOperationAction(ISD::FMAD, MVT::f32, Legal);
- if (!Subtarget->hasBFI())
- // fcopysign can be done in a single instruction with BFI.
- setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
-
- if (!Subtarget->hasBCNT(32))
- setOperationAction(ISD::CTPOP, MVT::i32, Expand);
-
- if (!Subtarget->hasBCNT(64))
- setOperationAction(ISD::CTPOP, MVT::i64, Expand);
-
- if (Subtarget->hasFFBH())
- setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);
-
- if (Subtarget->hasFFBL())
- setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
+ setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);
+ setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
// We only really have 32-bit BFE instructions (and 16-bit on VI).
//
@@ -523,14 +525,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
// span the midpoint are probably relatively rare, so don't worry about them
// for now.
- if (Subtarget->hasBFE())
- setHasExtractBitsInsn(true);
+ setHasExtractBitsInsn(true);
// Clamp modifier on add/sub
if (Subtarget->hasIntClamp())
setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal);
- if (Subtarget->hasAddNoCarry())
+ if (Subtarget->hasAddNoCarryInsts())
setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
Legal);
@@ -562,6 +563,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
+ setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, MVT::i32,
+ Custom);
+ setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, MVT::i16,
+ Custom);
+ setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, MVT::i1,
+ Custom);
+
// Custom lower these because we can't specify a rule based on an illegal
// source bf16.
setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
@@ -623,8 +631,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasBF16TransInsts())
setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
- setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
- setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);
+ setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
+ ISD::FP_TO_UINT_SAT},
+ MVT::f16, Promote);
+ setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
+ ISD::FP_TO_UINT_SAT},
+ MVT::bf16, Promote);
// F16 - VOP2 Actions.
setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
@@ -657,6 +669,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
break;
case ISD::EXTRACT_SUBVECTOR:
case ISD::CONCAT_VECTORS:
+ case ISD::FSIN:
+ case ISD::FCOS:
setOperationAction(Op, VT, Custom);
break;
default:
@@ -1016,6 +1030,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::SCALAR_TO_VECTOR,
ISD::ZERO_EXTEND,
ISD::SIGN_EXTEND_INREG,
+ ISD::ANY_EXTEND,
ISD::EXTRACT_VECTOR_ELT,
ISD::INSERT_VECTOR_ELT,
ISD::FCOPYSIGN});
@@ -1047,6 +1062,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::ATOMIC_LOAD_FMAX,
ISD::ATOMIC_LOAD_UINC_WRAP,
ISD::ATOMIC_LOAD_UDEC_WRAP,
+ ISD::ATOMIC_LOAD_USUB_COND,
+ ISD::ATOMIC_LOAD_USUB_SAT,
ISD::INTRINSIC_VOID,
ISD::INTRINSIC_W_CHAIN});
@@ -1109,12 +1126,9 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
if (Size == 16) {
- if (Subtarget->has16BitInsts()) {
- if (VT.isInteger())
- return MVT::v2i16;
- return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
- }
- return VT.isInteger() ? MVT::i32 : MVT::f32;
+ return Subtarget->has16BitInsts()
+ ? MVT::getVectorVT(ScalarVT.getSimpleVT(), 2)
+ : MVT::i32;
}
if (Size < 16)
@@ -1122,6 +1136,9 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
}
+ if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
+ return MVT::i32;
+
if (VT.getSizeInBits() > 32)
return MVT::i32;
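
A compact restatement of the new 16-bit rule above (sketch only; the surrounding size checks are elided): any 16-bit scalar pairs into a two-element vector of the same scalar type when 16-bit instructions exist, and widens to i32 otherwise, which is what folds the old bf16 special case away.

    static MVT regTypeFor16BitScalar(MVT ScalarVT, bool Has16BitInsts) {
      return Has16BitInsts ? MVT::getVectorVT(ScalarVT, 2) : MVT::i32;
    }
    // f16 -> v2f16, i16 -> v2i16, bf16 -> v2bf16 (previously i32)
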
@@ -1140,7 +1157,7 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
unsigned Size = ScalarVT.getSizeInBits();
// FIXME: Should probably promote 8-bit vectors to i16.
- if (Size == 16 && Subtarget->has16BitInsts())
+ if (Size == 16)
return (NumElts + 1) / 2;
if (Size <= 32)
@@ -1164,16 +1181,13 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
// FIXME: We should fix the ABI to be the same on targets without 16-bit
// support, but unless we can properly handle 3-vectors, it will still be
// inconsistent.
- if (Size == 16 && Subtarget->has16BitInsts()) {
- if (ScalarVT == MVT::bf16) {
- RegisterVT = MVT::i32;
- IntermediateVT = MVT::v2bf16;
- } else {
- RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
- IntermediateVT = RegisterVT;
- }
+ if (Size == 16) {
+ MVT SimpleIntermediateVT =
+ MVT::getVectorVT(ScalarVT.getSimpleVT(), ElementCount::getFixed(2));
+ IntermediateVT = SimpleIntermediateVT;
+ RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
NumIntermediates = (NumElts + 1) / 2;
- return NumIntermediates;
+ return (NumElts + 1) / 2;
}
if (Size == 32) {
@@ -1279,57 +1293,61 @@ static unsigned getIntrMemWidth(unsigned IntrID) {
case Intrinsic::amdgcn_global_store_async_from_lds_b32:
case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+ case Intrinsic::amdgcn_flat_load_monitor_b32:
+ case Intrinsic::amdgcn_global_load_monitor_b32:
return 32;
case Intrinsic::amdgcn_global_load_async_to_lds_b64:
case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
case Intrinsic::amdgcn_global_store_async_from_lds_b64:
case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+ case Intrinsic::amdgcn_flat_load_monitor_b64:
+ case Intrinsic::amdgcn_global_load_monitor_b64:
return 64;
case Intrinsic::amdgcn_global_load_async_to_lds_b128:
case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
case Intrinsic::amdgcn_global_store_async_from_lds_b128:
case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
+ case Intrinsic::amdgcn_flat_load_monitor_b128:
+ case Intrinsic::amdgcn_global_load_monitor_b128:
return 128;
default:
llvm_unreachable("Unknown width");
}
}
-static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
- TargetLoweringBase::IntrinsicInfo &Info) {
- Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
+static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI,
+ unsigned ArgIdx) {
+ Value *OrderingArg = CI.getArgOperand(ArgIdx);
unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
switch (AtomicOrderingCABI(Ord)) {
case AtomicOrderingCABI::acquire:
- Info.order = AtomicOrdering::Acquire;
+ return AtomicOrdering::Acquire;
break;
case AtomicOrderingCABI::release:
- Info.order = AtomicOrdering::Release;
+ return AtomicOrdering::Release;
break;
case AtomicOrderingCABI::seq_cst:
- Info.order = AtomicOrdering::SequentiallyConsistent;
+ return AtomicOrdering::SequentiallyConsistent;
break;
default:
- Info.order = AtomicOrdering::Monotonic;
- break;
+ return AtomicOrdering::Monotonic;
}
+}
- Info.flags =
- (IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore);
- Info.flags |= MOCooperative;
-
+static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx) {
MDNode *ScopeMD = cast<MDNode>(
- cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
+ cast<MetadataAsValue>(CI.getArgOperand(ArgIdx))->getMetadata());
StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
- Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
+ return CI.getContext().getOrInsertSyncScopeID(Scope);
}
-bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
- const CallInst &CI,
+void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
+ const CallBase &CI,
MachineFunction &MF,
unsigned IntrID) const {
+ IntrinsicInfo Info;
Info.flags = MachineMemOperand::MONone;
if (CI.hasMetadata(LLVMContext::MD_invariant_load))
Info.flags |= MachineMemOperand::MOInvariant;
@@ -1343,7 +1361,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Intrinsic::getFnAttributes(CI.getContext(), (Intrinsic::ID)IntrID);
MemoryEffects ME = Attr.getMemoryEffects();
if (ME.doesNotAccessMemory())
- return false;
+ return;
// TODO: Should images get their own address space?
Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
@@ -1433,13 +1451,35 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}
break;
case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_buffer_load_async_lds:
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
case Intrinsic::amdgcn_struct_buffer_load_lds:
- case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
+ case Intrinsic::amdgcn_struct_buffer_load_async_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
+
+ // Entry 0: Load from buffer.
+ // Don't set an offset, since the pointer value always represents the
+ // base of the buffer.
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
- Info.ptrVal = CI.getArgOperand(1);
- return true;
+ Info.flags &= ~MachineMemOperand::MOStore;
+ Infos.push_back(Info);
+
+ // Entry 1: Store to LDS.
+ // The instruction offset is applied, plus an additional per-lane offset,
+ // which we model by using a larger memory type.
+ Info.memVT = EVT::getIntegerVT(
+ CI.getContext(), Width * 8 * Subtarget->getWavefrontSize());
+ Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
+ Info.offset = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 2))
+ ->getZExtValue();
+ Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
+ Info.flags &= ~MachineMemOperand::MOLoad;
+ Info.flags |= MachineMemOperand::MOStore;
+ Infos.push_back(Info);
+ return;
}
case Intrinsic::amdgcn_raw_atomic_buffer_load:
case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
@@ -1449,11 +1489,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
std::numeric_limits<unsigned>::max());
Info.flags &= ~MachineMemOperand::MOStore;
- return true;
+ Infos.push_back(Info);
+ return;
}
}
}
- return true;
+ Infos.push_back(Info);
+ return;
}
switch (IntrID) {
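
A sketch of the two-entry shape the buffer-load-to-LDS case above now reports (Ctx, Width, and WavefrontSize stand in for values computed in the hunk): one memory-operand entry for the buffer read, and one for the LDS write whose type is widened by the wave size to cover every lane's store.

    IntrinsicInfo LoadPart = Info;  // entry 0: buffer read
    LoadPart.flags &= ~MachineMemOperand::MOStore;
    Infos.push_back(LoadPart);

    IntrinsicInfo StorePart = Info; // entry 1: per-lane LDS write
    StorePart.memVT = EVT::getIntegerVT(Ctx, Width * 8 * WavefrontSize);
    StorePart.flags &= ~MachineMemOperand::MOLoad;
    StorePart.flags |= MachineMemOperand::MOStore;
    Infos.push_back(StorePart);
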
@@ -1469,7 +1511,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
if (!Vol->isZero())
Info.flags |= MachineMemOperand::MOVolatile;
- return true;
+ Infos.push_back(Info);
+ return;
}
case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
@@ -1478,7 +1521,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = nullptr;
Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
- return true;
+ Infos.push_back(Info);
+ return;
}
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume: {
@@ -1492,7 +1536,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
if (!Vol->isZero())
Info.flags |= MachineMemOperand::MOVolatile;
- return true;
+ Infos.push_back(Info);
+ return;
}
case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
@@ -1505,16 +1550,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.size = 8;
Info.align.reset();
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
- return true;
- }
- case Intrinsic::amdgcn_global_atomic_csub: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType());
- Info.ptrVal = CI.getOperand(0);
- Info.align.reset();
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
- MachineMemOperand::MOVolatile;
- return true;
+ Infos.push_back(Info);
+ return;
}
case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
case Intrinsic::amdgcn_image_bvh_intersect_ray:
@@ -1530,14 +1567,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.align.reset();
Info.flags |=
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
- return true;
+ Infos.push_back(Info);
+ return;
}
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
- case Intrinsic::amdgcn_flat_atomic_fmax_num:
- case Intrinsic::amdgcn_atomic_cond_sub_u32: {
+ case Intrinsic::amdgcn_flat_atomic_fmax_num: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
@@ -1545,14 +1582,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOVolatile;
- return true;
+ Infos.push_back(Info);
+ return;
}
- case Intrinsic::amdgcn_flat_load_monitor_b32:
- case Intrinsic::amdgcn_flat_load_monitor_b64:
- case Intrinsic::amdgcn_flat_load_monitor_b128:
- case Intrinsic::amdgcn_global_load_monitor_b32:
- case Intrinsic::amdgcn_global_load_monitor_b64:
- case Intrinsic::amdgcn_global_load_monitor_b128:
case Intrinsic::amdgcn_cluster_load_b32:
case Intrinsic::amdgcn_cluster_load_b64:
case Intrinsic::amdgcn_cluster_load_b128:
@@ -1573,7 +1605,24 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
Info.flags |= MachineMemOperand::MOLoad;
- return true;
+ Infos.push_back(Info);
+ return;
+ }
+ case Intrinsic::amdgcn_flat_load_monitor_b32:
+ case Intrinsic::amdgcn_flat_load_monitor_b64:
+ case Intrinsic::amdgcn_flat_load_monitor_b128:
+ case Intrinsic::amdgcn_global_load_monitor_b32:
+ case Intrinsic::amdgcn_global_load_monitor_b64:
+ case Intrinsic::amdgcn_global_load_monitor_b128: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getOperand(0);
+ Info.align.reset();
+ Info.flags = MachineMemOperand::MOLoad;
+ Info.order = parseAtomicOrderingCABIArg(CI, 1);
+ Info.ssid = parseSyncscopeMDArg(CI, 2);
+ Infos.push_back(Info);
+ return;
}
case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
@@ -1582,8 +1631,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
- getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
- return true;
+ Info.flags = (MachineMemOperand::MOLoad | MOCooperative);
+ Info.order = parseAtomicOrderingCABIArg(CI, 1);
+ Info.ssid = parseSyncscopeMDArg(CI, 2);
+ Infos.push_back(Info);
+ return;
}
case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
@@ -1592,8 +1644,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
Info.ptrVal = CI.getArgOperand(0);
Info.align.reset();
- getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
- return true;
+ Info.flags = (MachineMemOperand::MOStore | MOCooperative);
+ Info.order = parseAtomicOrderingCABIArg(CI, 2);
+ Info.ssid = parseSyncscopeMDArg(CI, 3);
+ Infos.push_back(Info);
+ return;
}
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
@@ -1618,7 +1673,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOLoad;
else
Info.flags |= MachineMemOperand::MOStore;
- return true;
+ Infos.push_back(Info);
+ return;
}
case Intrinsic::amdgcn_global_load_async_to_lds_b8:
case Intrinsic::amdgcn_global_load_async_to_lds_b32:
@@ -1628,30 +1684,68 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
+ // Entry 0: Load from source (global/flat).
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
- Info.ptrVal = CI.getArgOperand(1);
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
- return true;
+ Info.ptrVal = CI.getArgOperand(0); // Global pointer
+ Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
+ Info.flags |= MachineMemOperand::MOLoad;
+ Infos.push_back(Info);
+
+ // Entry 1: Store to LDS (same offset).
+ Info.flags &= ~MachineMemOperand::MOLoad;
+ Info.flags |= MachineMemOperand::MOStore;
+ Info.ptrVal = CI.getArgOperand(1); // LDS pointer
+ Infos.push_back(Info);
+ return;
}
case Intrinsic::amdgcn_global_store_async_from_lds_b8:
case Intrinsic::amdgcn_global_store_async_from_lds_b32:
case Intrinsic::amdgcn_global_store_async_from_lds_b64:
case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
+ // Entry 0: Load from LDS.
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
- Info.ptrVal = CI.getArgOperand(0);
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
- return true;
+ Info.ptrVal = CI.getArgOperand(1); // LDS pointer
+ Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
+ Info.flags |= MachineMemOperand::MOLoad;
+ Infos.push_back(Info);
+
+ // Entry 1: Store to global (same offset).
+ Info.flags &= ~MachineMemOperand::MOLoad;
+ Info.flags |= MachineMemOperand::MOStore;
+ Info.ptrVal = CI.getArgOperand(0); // Global pointer
+ Infos.push_back(Info);
+ return;
}
case Intrinsic::amdgcn_load_to_lds:
- case Intrinsic::amdgcn_global_load_lds: {
- Info.opc = ISD::INTRINSIC_VOID;
+ case Intrinsic::amdgcn_load_async_to_lds:
+ case Intrinsic::amdgcn_global_load_lds:
+ case Intrinsic::amdgcn_global_load_async_lds: {
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
+ auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
+ bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;
+
+ // Entry 0: Load from source (global/flat).
+ Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
- Info.ptrVal = CI.getArgOperand(1);
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
- return true;
+ Info.ptrVal = CI.getArgOperand(0); // Source pointer
+ Info.offset = cast<ConstantInt>(CI.getArgOperand(3))->getSExtValue();
+ Info.flags |= MachineMemOperand::MOLoad;
+ if (IsVolatile)
+ Info.flags |= MachineMemOperand::MOVolatile;
+ Infos.push_back(Info);
+
+ // Entry 1: Store to LDS.
+ // Same offset from the instruction, but an additional per-lane offset is
+ // added. Represent that using a wider memory type.
+ Info.memVT = EVT::getIntegerVT(CI.getContext(),
+ Width * 8 * Subtarget->getWavefrontSize());
+ Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
+ Info.flags &= ~MachineMemOperand::MOLoad;
+ Info.flags |= MachineMemOperand::MOStore;
+ Infos.push_back(Info);
+ return;
}
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
@@ -1671,7 +1765,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.align = Align(4);
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
- return true;
+ Infos.push_back(Info);
+ return;
}
case Intrinsic::amdgcn_s_prefetch_data:
case Intrinsic::amdgcn_flat_prefetch:
@@ -1680,10 +1775,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
Info.ptrVal = CI.getArgOperand(0);
Info.flags |= MachineMemOperand::MOLoad;
- return true;
+ Infos.push_back(Info);
+ return;
}
default:
- return false;
+ return;
}
}
@@ -1709,7 +1805,6 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
Type *&AccessTy) const {
Value *Ptr = nullptr;
switch (II->getIntrinsicID()) {
- case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_cluster_load_b128:
case Intrinsic::amdgcn_cluster_load_b64:
case Intrinsic::amdgcn_cluster_load_b32:
@@ -1729,16 +1824,9 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
- case Intrinsic::amdgcn_flat_load_monitor_b128:
- case Intrinsic::amdgcn_flat_load_monitor_b32:
- case Intrinsic::amdgcn_flat_load_monitor_b64:
- case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
- case Intrinsic::amdgcn_global_load_monitor_b128:
- case Intrinsic::amdgcn_global_load_monitor_b32:
- case Intrinsic::amdgcn_global_load_monitor_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
case Intrinsic::amdgcn_global_load_tr4_b64:
@@ -1750,7 +1838,9 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
Ptr = II->getArgOperand(0);
break;
case Intrinsic::amdgcn_load_to_lds:
+ case Intrinsic::amdgcn_load_async_to_lds:
case Intrinsic::amdgcn_global_load_lds:
+ case Intrinsic::amdgcn_global_load_async_lds:
case Intrinsic::amdgcn_global_load_async_to_lds_b8:
case Intrinsic::amdgcn_global_load_async_to_lds_b32:
case Intrinsic::amdgcn_global_load_async_to_lds_b64:
@@ -1917,7 +2007,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
- return Subtarget->enableFlatScratch()
+ return Subtarget->hasFlatScratchEnabled()
? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS)
: isLegalMUBUFAddressingMode(AM);
@@ -1980,7 +2070,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
Align RequiredAlignment(
PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
- if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
+ if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 &&
Alignment < RequiredAlignment)
return false;
@@ -2229,7 +2319,8 @@ bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
// TODO: This should be more aggressive, particularly for 16-bit element
// vectors. However there are some mixed improvements and regressions.
EVT EltTy = VT.getVectorElementType();
- return EltTy.getSizeInBits() % 32 == 0;
+ unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
+ return EltTy.getSizeInBits() % MinAlign == 0;
}
bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
@@ -2251,6 +2342,14 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
return TargetLowering::isTypeDesirableForOp(Op, VT);
}
+MachinePointerInfo
+SITargetLowering::getKernargSegmentPtrInfo(MachineFunction &MF) const {
+ // This isn't really a constant pool but close enough.
+ MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool());
+ PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS;
+ return PtrInfo;
+}
+
SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
const SDLoc &SL,
SDValue Chain,
@@ -2313,9 +2412,16 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
}
- if (MemVT.isFloatingPoint())
- Val = getFPExtOrFPRound(DAG, Val, SL, VT);
- else if (Signed)
+ if (MemVT.isFloatingPoint()) {
+ if (VT.isFloatingPoint()) {
+ Val = getFPExtOrFPRound(DAG, Val, SL, VT);
+ } else {
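+      // Integer result with a floating-point memory type: reinterpret the
+      // bits as a same-width integer, then any-extend or truncate.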
+ assert(!MemVT.isVector());
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+ SDValue Cast = DAG.getBitcast(IntVT, Val);
+ Val = DAG.getAnyExtOrTrunc(Cast, SL, VT);
+ }
+ } else if (Signed)
Val = DAG.getSExtOrTrunc(Val, SL, VT);
else
Val = DAG.getZExtOrTrunc(Val, SL, VT);
@@ -2327,7 +2433,9 @@ SDValue SITargetLowering::lowerKernargMemParameter(
SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
uint64_t Offset, Align Alignment, bool Signed,
const ISD::InputArg *Arg) const {
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+  MachinePointerInfo PtrInfo =
+      getKernargSegmentPtrInfo(DAG.getMachineFunction());
// Try to avoid using an extload by loading earlier than the argument address,
// and extracting the relevant bits. The load should hopefully be merged with
@@ -2342,7 +2450,8 @@ SDValue SITargetLowering::lowerKernargMemParameter(
// TODO: If we passed in the base kernel offset we could have a better
// alignment than 4, but we don't really need it.
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
- SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
+ SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
+ PtrInfo.getWithOffset(AlignDownOffset), Align(4),
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
@@ -2357,9 +2466,9 @@ SDValue SITargetLowering::lowerKernargMemParameter(
}
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
- SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
+ SDValue Load = DAG.getLoad(
+ MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
+ MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
@@ -3023,7 +3132,7 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
CallingConv::ID CallConv,
bool IsShader) const {
bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
- if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
+ if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
// Note: user SGPRs are handled by the front-end for graphics shaders
// Pad up the used user SGPRs with dead inputs.
@@ -3092,7 +3201,7 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
}
- assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
+ assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
Info.getNumPreloadedSGPRs() >= 16);
}
@@ -3120,7 +3229,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// the scratch registers to pass in.
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
- if (!ST.enableFlatScratch()) {
+ if (!ST.hasFlatScratchEnabled()) {
if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
@@ -3263,7 +3372,7 @@ SDValue SITargetLowering::LowerFormalArguments(
!Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
!Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
(void)UserSGPRInfo;
- if (!Subtarget->enableFlatScratch())
+ if (!Subtarget->hasFlatScratchEnabled())
assert(!UserSGPRInfo.hasFlatScratchInit());
if ((CallConv != CallingConv::AMDGPU_CS &&
CallConv != CallingConv::AMDGPU_Gfx &&
@@ -3334,7 +3443,7 @@ SDValue SITargetLowering::LowerFormalArguments(
allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
// FIXME: Sink this into allocateSpecialInputSGPRs
- if (!Subtarget->enableFlatScratch())
+ if (!Subtarget->hasFlatScratchEnabled())
CCInfo.AllocateReg(Info->getScratchRSrcReg());
allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
@@ -3559,11 +3668,17 @@ SDValue SITargetLowering::LowerFormalArguments(
if (IsEntryFunc)
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
- // DAG.getPass() returns nullptr when using new pass manager.
- // TODO: Use DAG.getMFAM() to access analysis result.
if (DAG.getPass()) {
- auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
- ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
+ auto &ArgUsageInfo =
+ DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfoWrapperLegacy>();
+ ArgUsageInfo.getArgUsageInfo().setFuncArgInfo(Fn, Info->getArgInfo());
+ } else if (auto *MFAM = DAG.getMFAM()) {
+ Module &M = *MF.getFunction().getParent();
+ auto *ArgUsageInfo =
+ MFAM->getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF)
+ .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
+ if (ArgUsageInfo)
+ ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo());
}
unsigned StackArgSize = CCInfo.getStackSize();
@@ -3778,12 +3893,19 @@ void SITargetLowering::passSpecialInputs(
const AMDGPUFunctionArgInfo *CalleeArgInfo =
&AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
- // DAG.getPass() returns nullptr when using new pass manager.
- // TODO: Use DAG.getMFAM() to access analysis result.
if (DAG.getPass()) {
auto &ArgUsageInfo =
- DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
- CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
+ DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfoWrapperLegacy>();
+ CalleeArgInfo =
+ &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc);
+ } else if (auto *MFAM = DAG.getMFAM()) {
+ Module &M = *DAG.getMachineFunction().getFunction().getParent();
+ auto *ArgUsageInfo =
+ MFAM->getResult<ModuleAnalysisManagerMachineFunctionProxy>(
+ DAG.getMachineFunction())
+ .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
+ if (ArgUsageInfo)
+ CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc);
}
}
@@ -4049,7 +4171,7 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
if (!CI->isTailCall())
return false;
- const Function *ParentFn = CI->getParent()->getParent();
+ const Function *ParentFn = CI->getFunction();
if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
return false;
return true;
@@ -4233,7 +4355,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
if (!IsSibCall || IsChainCallConv) {
- if (!Subtarget->enableFlatScratch()) {
+ if (!Subtarget->hasFlatScratchEnabled()) {
SmallVector<SDValue, 4> CopyFromChains;
// In the HSA case, this should be an identity copy.
@@ -5058,7 +5180,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
// Compare the just read M0 value to all possible Idx values.
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
.addReg(CurrentIdxReg)
- .addReg(Idx.getReg(), 0, Idx.getSubReg());
+ .addReg(Idx.getReg(), {}, Idx.getSubReg());
// Update EXEC, save the original EXEC value to VCC.
BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
@@ -5259,7 +5381,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
- .addReg(SrcReg, 0, SubReg)
+ .addReg(SrcReg, {}, SubReg)
.addReg(SrcReg, RegState::Implicit);
}
@@ -5293,7 +5415,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
.addImm(SubReg);
} else {
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
- .addReg(SrcReg, 0, SubReg)
+ .addReg(SrcReg, {}, SubReg)
.addReg(SrcReg, RegState::Implicit);
}
@@ -5466,6 +5588,10 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
return std::numeric_limits<uint32_t>::min();
case AMDGPU::S_MAX_I32:
return std::numeric_limits<int32_t>::min();
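+  // x + (-0.0) == x for every x under IEEE-754, while x + (+0.0) would turn
+  // a -0.0 input into +0.0, so -0.0 is the additive identity. The FSUB
+  // expansion negates its input, so it starts from +0.0 instead.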
+ case AMDGPU::V_ADD_F32_e64: // -0.0
+ return 0x80000000;
+ case AMDGPU::V_SUB_F32_e64: // +0.0
+ return 0x0;
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32:
case AMDGPU::S_OR_B32:
@@ -5473,6 +5599,9 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
return std::numeric_limits<uint32_t>::min();
case AMDGPU::S_AND_B32:
return std::numeric_limits<uint32_t>::max();
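+  // minNum/maxNum return the other operand when exactly one input is a quiet
+  // NaN, so qNaN is the identity for the floating-point min/max reductions.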
+ case AMDGPU::V_MIN_F32_e64:
+ case AMDGPU::V_MAX_F32_e64:
+    return 0x7fc00000; // qNaN
default:
llvm_unreachable(
"Unexpected opcode in getIdentityValueFor32BitWaveReduction");
@@ -5489,6 +5618,11 @@ static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
return std::numeric_limits<uint64_t>::min();
case AMDGPU::V_CMP_GT_I64_e64: // max.i64
return std::numeric_limits<int64_t>::min();
+ case AMDGPU::V_MIN_F64_e64:
+ case AMDGPU::V_MAX_F64_e64:
+ case AMDGPU::V_MIN_NUM_F64_e64:
+ case AMDGPU::V_MAX_NUM_F64_e64:
+    return 0x7ff8000000000000; // qNaN
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO:
case AMDGPU::S_OR_B64:
@@ -5496,6 +5630,9 @@ static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
return std::numeric_limits<uint64_t>::min();
case AMDGPU::S_AND_B64:
return std::numeric_limits<uint64_t>::max();
+ case AMDGPU::V_ADD_F64_e64:
+ case AMDGPU::V_ADD_F64_pseudo_e64:
+ return 0x8000000000000000; // -0.0
default:
llvm_unreachable(
"Unexpected opcode in getIdentityValueFor64BitWaveReduction");
@@ -5507,7 +5644,17 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {
Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
- Opc == AMDGPU::S_XOR_B32;
+ Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
+ Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
+ Opc == AMDGPU::V_SUB_F32_e64;
+}
+
+static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
+ return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
+ Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
+ Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
+ Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
+ Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
}
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
@@ -5528,8 +5675,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
switch (Opc) {
case AMDGPU::S_MIN_U32:
case AMDGPU::S_MIN_I32:
+ case AMDGPU::V_MIN_F32_e64:
case AMDGPU::S_MAX_U32:
case AMDGPU::S_MAX_I32:
+ case AMDGPU::V_MAX_F32_e64:
case AMDGPU::S_AND_B32:
case AMDGPU::S_OR_B32: {
// Idempotent operations.
@@ -5541,6 +5690,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
case AMDGPU::V_CMP_LT_I64_e64: // min
case AMDGPU::V_CMP_GT_U64_e64: // umax
case AMDGPU::V_CMP_GT_I64_e64: // max
+ case AMDGPU::V_MIN_F64_e64:
+ case AMDGPU::V_MIN_NUM_F64_e64:
+ case AMDGPU::V_MAX_F64_e64:
+ case AMDGPU::V_MAX_NUM_F64_e64:
case AMDGPU::S_AND_B64:
case AMDGPU::S_OR_B64: {
// Idempotent operations.
@@ -5552,8 +5705,12 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
case AMDGPU::S_XOR_B64:
case AMDGPU::S_ADD_I32:
case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::V_ADD_F32_e64:
+ case AMDGPU::V_ADD_F64_e64:
+ case AMDGPU::V_ADD_F64_pseudo_e64:
case AMDGPU::S_SUB_I32:
- case AMDGPU::S_SUB_U64_PSEUDO: {
+ case AMDGPU::S_SUB_U64_PSEUDO:
+ case AMDGPU::V_SUB_F32_e64: {
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
@@ -5708,6 +5865,72 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addImm(AMDGPU::sub1);
break;
}
+ case AMDGPU::V_ADD_F32_e64:
+ case AMDGPU::V_ADD_F64_e64:
+ case AMDGPU::V_ADD_F64_pseudo_e64:
+ case AMDGPU::V_SUB_F32_e64: {
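+    // With a uniform input, the add/sub reduction reduces to scaling the
+    // value by the number of active lanes (negating the input for FSUB).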
+ bool is32BitOpc = is32bitWaveReduceOperation(Opc);
+ const TargetRegisterClass *VregRC = TII->getRegClass(TII->get(Opc), 0);
+ Register ActiveLanesVreg = MRI.createVirtualRegister(VregRC);
+ Register DstVreg = MRI.createVirtualRegister(VregRC);
+    // Convert the number of active lanes to a float value.
+ BuildMI(BB, MI, DL,
+ TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
+ : AMDGPU::V_CVT_F64_I32_e64),
+ ActiveLanesVreg)
+ .addReg(NewAccumulator->getOperand(0).getReg())
+ .addImm(0) // clamp
+ .addImm(0); // output-modifier
+
+      // Negate the input for SUB reductions.
+      unsigned SrcMod =
+          (Opc == AMDGPU::V_SUB_F32_e64 ||
+           MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
+              ? SISrcMods::NEG
+              : SISrcMods::NONE;
+      unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
+                        : ST.getGeneration() >= AMDGPUSubtarget::GFX12
+                            ? AMDGPU::V_MUL_F64_pseudo_e64
+                            : AMDGPU::V_MUL_F64_e64;
+      auto DestVregInst = BuildMI(BB, MI, DL, TII->get(MulOpc), DstVreg)
+                              .addImm(SrcMod) // src0 modifier
+                              .addReg(SrcReg)
+                              .addImm(SISrcMods::NONE) // src1 modifier
+                              .addReg(ActiveLanesVreg)
+                              .addImm(0)  // clamp
+                              .addImm(0); // omod
+ if (is32BitOpc) {
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .addReg(DstVreg);
+ } else {
+ Register LaneValueLoReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register LaneValueHiReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ const TargetRegisterClass *VregSubRC =
+ TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
+ MachineOperand Op1L =
+ TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0),
+ VregRC, AMDGPU::sub0, VregSubRC);
+ MachineOperand Op1H =
+ TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0),
+ VregRC, AMDGPU::sub1, VregSubRC);
+      // The result must end up in SGPRs, so read the first lane of each half.
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ LaneValueLoReg)
+ .add(Op1L);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ LaneValueHiReg)
+ .add(Op1H);
+ NewAccumulator =
+ BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+ .addReg(LaneValueLoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(LaneValueHiReg)
+ .addImm(AMDGPU::sub1);
+    }
+    break;
+  }
}
RetBB = &BB;
}
@@ -5725,6 +5948,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
MachineBasicBlock::iterator I = BB.end();
Register SrcReg = MI.getOperand(1).getReg();
bool is32BitOpc = is32bitWaveReduceOperation(Opc);
+ bool isFPOp = isFloatingPointWaveReduceOperation(Opc);
// Create Control flow for loop
// Split MI's Machine Basic block into For loop
@@ -5753,7 +5977,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
.addImm(IdentityValue);
} else {
- uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc);
+ uint64_t IdentityValue =
+ MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
+ ? 0x0 // +0.0 for double sub reduction
+ : getIdentityValueFor64BitWaveReduction(Opc);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
.addImm(IdentityValue);
}
@@ -5784,9 +6011,29 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
LaneValueReg)
.addReg(SrcReg)
.addReg(FF1Reg);
- NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
- .addReg(Accumulator->getOperand(0).getReg())
- .addReg(LaneValueReg);
+ if (isFPOp) {
+ Register LaneValVreg =
+ MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
+ Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
+        // Copy the lane value into a VGPR to avoid the constant bus
+        // restriction.
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+ LaneValVreg)
+ .addReg(LaneValueReg);
+ BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
+ .addImm(0) // src0 modifier
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addImm(0) // src1 modifier
+ .addReg(LaneValVreg)
+ .addImm(0) // clamp
+ .addImm(0); // omod
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .addReg(DstVreg);
+ } else {
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addReg(LaneValueReg);
+ }
} else {
Register LaneValueLoReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -5832,7 +6079,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
Register ComparisonResultReg =
MRI.createVirtualRegister(WaveMaskRegClass);
- const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
+ int SrcIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
+ const TargetRegisterClass *VregClass =
+ TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
const TargetRegisterClass *VSubRegClass =
TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
@@ -5863,6 +6113,60 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addReg(Accumulator->getOperand(0).getReg());
break;
}
+ case AMDGPU::V_MIN_F64_e64:
+ case AMDGPU::V_MIN_NUM_F64_e64:
+ case AMDGPU::V_MAX_F64_e64:
+ case AMDGPU::V_MAX_NUM_F64_e64:
+ case AMDGPU::V_ADD_F64_e64:
+ case AMDGPU::V_ADD_F64_pseudo_e64: {
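+      // The 64-bit FP op executes in VGPRs: copy the SGPR accumulator into a
+      // VGPR, apply the op, then read each 32-bit half back to SGPRs with
+      // v_readfirstlane.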
+ int SrcIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
+ const TargetRegisterClass *VregRC =
+ TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
+ const TargetRegisterClass *VregSubRC =
+ TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
+ Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
+ Register DstVreg = MRI.createVirtualRegister(VregRC);
+ Register LaneValLo =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register LaneValHi =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
+ .addReg(Accumulator->getOperand(0).getReg());
+ unsigned Modifier =
+ MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
+ ? SISrcMods::NEG
+ : SISrcMods::NONE;
+ auto DstVregInst = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
+ .addImm(Modifier) // src0 modifiers
+ .addReg(LaneValue->getOperand(0).getReg())
+ .addImm(SISrcMods::NONE) // src1 modifiers
+ .addReg(AccumulatorVReg)
+                              .addImm(0)  // clamp
+                              .addImm(0); // omod
+ auto ReadLaneLo =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ LaneValLo);
+ auto ReadLaneHi =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ LaneValHi);
+ MachineBasicBlock::iterator Iters = *ReadLaneLo;
+ MachineOperand Op1L =
+ TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
+ VregRC, AMDGPU::sub0, VregSubRC);
+ MachineOperand Op1H =
+ TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
+ VregRC, AMDGPU::sub1, VregSubRC);
+ ReadLaneLo.add(Op1L);
+ ReadLaneHi.add(Op1H);
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+ TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+ .addReg(LaneValLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(LaneValHi)
+ .addImm(AMDGPU::sub1);
+ break;
+ }
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
@@ -5918,6 +6222,13 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
+ case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
+ case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(),
+ ST.getGeneration() >= AMDGPUSubtarget::GFX12
+ ? AMDGPU::V_MIN_NUM_F64_e64
+ : AMDGPU::V_MIN_F64_e64);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
@@ -5926,14 +6237,37 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
+ case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
+ case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(),
+ ST.getGeneration() >= AMDGPUSubtarget::GFX12
+ ? AMDGPU::V_MAX_NUM_F64_e64
+ : AMDGPU::V_MAX_F64_e64);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
+ case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
+ case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(),
+ ST.getGeneration() >= AMDGPUSubtarget::GFX12
+ ? AMDGPU::V_ADD_F64_pseudo_e64
+ : AMDGPU::V_ADD_F64_e64);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
+ case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
+ case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
+    // There is no S/V_SUB_F64 opcode; f64 subtraction is expanded as an fadd
+    // with the NEG source modifier set on the input.
+ return lowerWaveReduce(MI, *BB, *getSubtarget(),
+ ST.getGeneration() >= AMDGPUSubtarget::GFX12
+ ? AMDGPU::V_ADD_F64_pseudo_e64
+ : AMDGPU::V_ADD_F64_e64);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
@@ -6203,7 +6537,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
case AMDGPU::SI_INDIRECT_SRC_V1:
case AMDGPU::SI_INDIRECT_SRC_V2:
+ case AMDGPU::SI_INDIRECT_SRC_V3:
case AMDGPU::SI_INDIRECT_SRC_V4:
+ case AMDGPU::SI_INDIRECT_SRC_V5:
+ case AMDGPU::SI_INDIRECT_SRC_V6:
+ case AMDGPU::SI_INDIRECT_SRC_V7:
case AMDGPU::SI_INDIRECT_SRC_V8:
case AMDGPU::SI_INDIRECT_SRC_V9:
case AMDGPU::SI_INDIRECT_SRC_V10:
@@ -6214,7 +6552,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return emitIndirectSrc(MI, *BB, *getSubtarget());
case AMDGPU::SI_INDIRECT_DST_V1:
case AMDGPU::SI_INDIRECT_DST_V2:
+ case AMDGPU::SI_INDIRECT_DST_V3:
case AMDGPU::SI_INDIRECT_DST_V4:
+ case AMDGPU::SI_INDIRECT_DST_V5:
+ case AMDGPU::SI_INDIRECT_DST_V6:
+ case AMDGPU::SI_INDIRECT_DST_V7:
case AMDGPU::SI_INDIRECT_DST_V8:
case AMDGPU::SI_INDIRECT_DST_V9:
case AMDGPU::SI_INDIRECT_DST_V10:
@@ -6344,8 +6686,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case AMDGPU::DS_GWS_INIT:
case AMDGPU::DS_GWS_SEMA_BR:
case AMDGPU::DS_GWS_BARRIER:
- TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
- [[fallthrough]];
case AMDGPU::DS_GWS_SEMA_V:
case AMDGPU::DS_GWS_SEMA_P:
case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
@@ -6711,6 +7051,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerBRCOND(Op, DAG);
case ISD::RETURNADDR:
return LowerRETURNADDR(Op, DAG);
+ case ISD::SPONENTRY:
+ return LowerSPONENTRY(Op, DAG);
case ISD::LOAD: {
SDValue Result = LowerLOAD(Op, DAG);
assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
@@ -6743,6 +7085,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
return LowerGlobalAddress(MFI, Op, DAG);
}
+ case ISD::ExternalSymbol:
+ return LowerExternalSymbol(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_W_CHAIN:
@@ -6792,6 +7136,12 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return splitTernaryVectorOp(Op, DAG);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
+ Op.getValueType() == MVT::i16 &&
+ Op.getOperand(0).getValueType() == MVT::f32) {
+ // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
+ return Op;
+ }
return LowerFP_TO_INT(Op, DAG);
case ISD::SHL:
case ISD::SRA:
@@ -7032,9 +7382,15 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
SDLoc SL(N);
if (Src.getOpcode() == ISD::SETCC) {
+ SDValue Op0 = Src.getOperand(0);
+ SDValue Op1 = Src.getOperand(1);
+ // Need to expand bfloat to float for comparison (setcc).
+ if (Op0.getValueType() == MVT::bf16) {
+ Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
+ Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
+ }
// (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
- return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
- Src.getOperand(1), Src.getOperand(2));
+ return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
}
if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
// (ballot 0) -> 0
@@ -7260,6 +7616,84 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
return DAG.getBitcast(VT, UnrolledLaneOp);
}
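+// Lower a 32-bit wave shuffle, e.g. (a sketch of the intrinsic form handled
+// below, with the value in operand 1 and the lane index in operand 2):
+//   %r = call i32 @llvm.amdgcn.wave.shuffle(i32 %value, i32 %index)
+// On targets with a wave-wide bpermute this is a single ds_bpermute; on
+// wave64 targets without it, both halves are permuted and the result is
+// selected per lane using whole wave mode.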
+static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N,
+ SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
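+  // ds_bpermute moves exactly 32 bits per lane, so only 32-bit values are
+  // handled here.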
+ if (VT.getSizeInBits() != 32)
+ return SDValue();
+
+ SDLoc SL(N);
+
+ SDValue Value = N->getOperand(1);
+ SDValue Index = N->getOperand(2);
+
+  // ds_bpermute addresses lanes in bytes, so scale the index by 4.
+ SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
+ SDValue ShiftedIndex =
+ DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount);
+
+  // The intrinsics below operate on i32 values, so bitcast the input.
+ SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value);
+
+  auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
+                                   ArrayRef<SDValue> IntrinArgs) -> SDValue {
+    SmallVector<SDValue> Operands;
+    Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
+    Operands.append(IntrinArgs.begin(), IntrinArgs.end());
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
+  };
+
+ // If we can bpermute across the whole wave, then just do that
+ if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
+ SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
+ {ShiftedIndex, ValueI32});
+ return DAG.getBitcast(VT, BPermute);
+ }
+
+ assert(TLI.getSubtarget()->isWave64());
+
+  // Otherwise, fall back to whole wave mode (WWM).
+ SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0));
+
+ // Set inactive lanes to poison
+ SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
+ {ValueI32, PoisonVal});
+ SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
+ {ShiftedIndex, PoisonVal});
+
+  // permlane64 swaps the two 32-lane halves of the wave.
+  SDValue Swapped =
+      MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
+
+ // Get permutation of each half, then we'll select which one to use
+ SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
+ {WWMIndex, WWMValue});
+ SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
+ MVT::i32, {WWMIndex, Swapped});
+ SDValue BPermOtherHalfWWM =
+ MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
+
+ // Select which side to take the permute from
+ SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
+ // We can get away with only using mbcnt_lo here since we're only
+ // trying to detect which side of 32 each lane is on, and mbcnt_lo
+ // returns 32 for lanes 32-63.
+  SDValue ThreadID =
+      MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
+                    {ThreadIDMask, DAG.getConstant(0, SL, MVT::i32)});
+
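+  // Bit 5 of (ThreadID ^ Index) is set exactly when the source lane lies in
+  // the other 32-lane half of the wave.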
+  SDValue SameOrOtherHalf =
+      DAG.getNode(ISD::AND, SL, MVT::i32,
+                  DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
+                  DAG.getConstant(32, SL, MVT::i32));
+ SDValue UseSameHalf =
+ DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
+ DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
+ SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
+ BPermOtherHalfWWM);
+ return DAG.getBitcast(VT, Result);
+}
+
void SITargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
@@ -7632,6 +8066,20 @@ SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}
+SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ // For functions that set up their own stack, select the GET_STACK_BASE
+ // pseudo.
+ if (MFI->isBottomOfStack())
+ return Op;
+
+ // For everything else, create a dummy stack object.
+ int FI = MF.getFrameInfo().CreateFixedObject(1, 0, /*IsImmutable=*/false);
+ return DAG.getFrameIndex(FI, Op.getValueType());
+}
+
SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
const SDLoc &DL, EVT VT) const {
return Op.getValueType().bitsLE(VT)
@@ -7701,8 +8149,7 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
// Round-inexact-to-odd f64 to f32, then do the final rounding using the
// hardware f32 -> bf16 instruction.
- EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) :
- MVT::f32;
+ EVT F32VT = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
DAG.getTargetConstant(0, DL, MVT::i32));
@@ -7849,14 +8296,13 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
: Op->getOperand(0).getValueType();
- auto ExtTy = OpTy.changeElementType(MVT::i32);
+ auto &DAG = DCI.DAG;
+ auto ExtTy = OpTy.changeElementType(*DAG.getContext(), MVT::i32);
if (DCI.isBeforeLegalizeOps() ||
isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
return SDValue();
- auto &DAG = DCI.DAG;
-
SDLoc DL(Op);
SDValue LHS;
SDValue RHS;
@@ -8033,7 +8479,7 @@ SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
}
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
- if (!Subtarget->isTrapHandlerEnabled() ||
+ if (!Subtarget->hasTrapHandler() ||
Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
return lowerTrapEndpgm(Op, DAG);
@@ -8054,10 +8500,11 @@ SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
MachineFunction &MF = DAG.getMachineFunction();
uint64_t Offset = getImplicitParameterOffset(MF, Param);
SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
- return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
+ MachinePointerInfo PtrInfo =
+ getKernargSegmentPtrInfo(DAG.getMachineFunction());
+ return DAG.getLoad(
+ VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
+ MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
}
SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
@@ -8115,7 +8562,7 @@ SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
MachineFunction &MF = DAG.getMachineFunction();
- if (!Subtarget->isTrapHandlerEnabled() ||
+ if (!Subtarget->hasTrapHandler() ||
Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
LLVMContext &Ctx = MF.getFunction().getContext();
Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
@@ -8319,6 +8766,9 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
Op.getValueType() == MVT::i64) {
const SIMachineFunctionInfo *Info =
DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
+    // If the high bits are known zero, the cast is a plain zero-extension.
+    if (Info->get32BitAddressHighBits() == 0)
+      return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);
+
SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
@@ -8847,17 +9297,17 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
GV->hasExternalLinkage()) {
- Type *Ty = GV->getValueType();
+ const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
// HIP uses an unsized array `extern __shared__ T s[]` or similar
// zero-sized type in other languages to declare the dynamic shared
// memory which size is not known at the compile time. They will be
// allocated by the runtime and placed directly after the static
// allocated ones. They all share the same offset.
- if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
+ if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
// Adjust alignment for that dynamic shared memory array.
Function &F = DAG.getMachineFunction().getFunction();
- MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
+ MFI->setDynLDSAlign(F, GVar);
MFI->setUsesDynamicLDS(true);
return SDValue(
DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
@@ -8912,6 +9362,15 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
MachineMemOperand::MOInvariant);
}
+SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
+ SelectionDAG &DAG) const {
+ // TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+ DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+ Fn, "unsupported external symbol", Op.getDebugLoc()));
+ return DAG.getPOISON(Op.getValueType());
+}
+
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
const SDLoc &DL, SDValue V) const {
// We can't use S_MOV_B32 directly, because there is no way to specify m0 as
@@ -9131,16 +9590,23 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SDLoc DL(Op);
MachineFunction &MF = DAG.getMachineFunction();
const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+ unsigned IntrOpcode = Intr->BaseOpcode;
+  // For image atomics, use the no-return opcode variant when the result is
+  // unused.
+ if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
+ !Op.getNode()->hasAnyUseOfValue(0))
+ IntrOpcode = Intr->AtomicNoRetBaseOpcode;
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
- AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+ AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
- unsigned IntrOpcode = Intr->BaseOpcode;
bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
SmallVector<EVT, 3> ResultTypes(Op->values());
SmallVector<EVT, 3> OrigResultTypes(Op->values());
+  // No-return atomics produce no data result, only a chain.
+  if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
+    ResultTypes.erase(&ResultTypes[0]);
+
bool IsD16 = false;
bool IsG16 = false;
bool IsA16 = false;
@@ -9159,8 +9625,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
VData = Op.getOperand(2);
IsAtomicPacked16Bit =
- (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
- Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
+ (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
+ IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
+ IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
+ IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
bool Is64Bit = VData.getValueSizeInBits() == 64;
if (BaseOpcode->AtomicX2) {
@@ -9170,7 +9638,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
if (Is64Bit)
VData = DAG.getBitcast(MVT::v4i32, VData);
- ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
+ if (!BaseOpcode->NoReturn)
+ ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
+
DMask = Is64Bit ? 0xf : 0x3;
NumVDataDwords = Is64Bit ? 4 : 2;
} else {
@@ -9396,8 +9866,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
- if (BaseOpcode->Atomic)
- CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+ // Keep GLC only when the atomic's result is actually used.
+ if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
+ CPol |= AMDGPU::CPol::GLC;
if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
AMDGPU::CPol::VOLATILE))
return Op;
@@ -9509,13 +9980,20 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
DAG.setNodeMemRefs(NewNode, {MemRef});
}
+ if (BaseOpcode->NoReturn) {
+ if (BaseOpcode->Atomic)
+ return DAG.getMergeValues(
+ {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
+
+ return SDValue(NewNode, 0);
+ }
+
if (BaseOpcode->AtomicX2) {
SmallVector<SDValue, 1> Elt;
DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
}
- if (BaseOpcode->NoReturn)
- return SDValue(NewNode, 0);
+
return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
NumVDataDwords, IsAtomicPacked16Bit, DL);
@@ -9709,7 +10187,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
}
case Intrinsic::amdgcn_kernarg_segment_ptr: {
- if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
+ if (!AMDGPU::isKernel(MF.getFunction())) {
// This only makes sense to call in a kernel, so just lower to null.
return DAG.getConstant(0, DL, VT);
}
@@ -10110,11 +10588,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDLoc SL(Op);
auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
- {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
- Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
- IndexKey, Op.getOperand(7),
- Op.getOperand(8)}); // No clamp operand
+ SmallVector<SDValue> Args{
+ Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
+ Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
+ IndexKey, Op.getOperand(7), Op.getOperand(8)};
+ if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
+ Args.push_back(Op.getOperand(9));
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), Args);
}
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
@@ -10148,6 +10628,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Poisons.push_back(DAG.getPOISON(ValTy));
return DAG.getMergeValues(Poisons, SDLoc(Op));
}
+ case Intrinsic::amdgcn_wave_shuffle:
+ return lowerWaveShuffle(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -10455,9 +10937,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
- case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
- return lowerRawBufferAtomicIntrin(Op, DAG,
- AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
case Intrinsic::amdgcn_struct_buffer_atomic_swap:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
return lowerStructBufferAtomicIntrin(Op, DAG,
@@ -10499,10 +10978,21 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_CSUB);
+ case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
+ return lowerRawBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
return lowerStructBufferAtomicIntrin(Op, DAG,
AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
-
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
@@ -10752,6 +11242,19 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
Ops, M->getMemOperand());
}
+ case Intrinsic::amdgcn_s_alloc_vgpr: {
+    // The requested VGPR count must be uniform; force divergent values
+    // through v_readfirstlane below.
+    SDValue NumVGPRs = Op.getOperand(2);
+    if (!NumVGPRs->isDivergent())
+      return Op;
+
+ SDValue ReadFirstLaneID =
+ DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
+ NumVGPRs = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+ ReadFirstLaneID, NumVGPRs);
+
+ return DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, Op->getVTList(),
+ Op.getOperand(0), Op.getOperand(1), NumVGPRs);
+ }
case Intrinsic::amdgcn_s_get_barrier_state:
case Intrinsic::amdgcn_s_get_named_barrier_state: {
SDValue Chain = Op->getOperand(0);
@@ -10794,6 +11297,26 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
Chain, Ptr, MII->getMemOperand());
}
+ case Intrinsic::amdgcn_flat_load_monitor_b32:
+ case Intrinsic::amdgcn_flat_load_monitor_b64:
+ case Intrinsic::amdgcn_flat_load_monitor_b128: {
+ MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDValue Ptr = Op->getOperand(2);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::FLAT_LOAD_MONITOR, DL,
+ Op->getVTList(), {Chain, Ptr},
+ MII->getMemoryVT(), MII->getMemOperand());
+ }
+ case Intrinsic::amdgcn_global_load_monitor_b32:
+ case Intrinsic::amdgcn_global_load_monitor_b64:
+ case Intrinsic::amdgcn_global_load_monitor_b128: {
+ MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDValue Ptr = Op->getOperand(2);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::GLOBAL_LOAD_MONITOR, DL,
+ Op->getVTList(), {Chain, Ptr},
+ MII->getMemoryVT(), MII->getMemOperand());
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
@@ -10932,12 +11455,24 @@ SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
return VData;
}
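+// Identify the async variants of the LDS DMA intrinsics; lowering records
+// this as an extra immediate operand on the selected node.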
+static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
+ switch (Intr) {
+ case Intrinsic::amdgcn_raw_buffer_load_async_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_async_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
+ case Intrinsic::amdgcn_load_async_to_lds:
+ case Intrinsic::amdgcn_global_load_async_lds:
+ return true;
+ }
+ return false;
+}
+
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
unsigned IntrinsicID = Op.getConstantOperandVal(1);
- MachineFunction &MF = DAG.getMachineFunction();
switch (IntrinsicID) {
case Intrinsic::amdgcn_exp_compr: {
@@ -11128,15 +11663,21 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
M->getMemoryVT(), M->getMemOperand());
}
case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_buffer_load_async_lds:
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
case Intrinsic::amdgcn_struct_buffer_load_lds:
- case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
+ case Intrinsic::amdgcn_struct_buffer_load_async_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
if (!Subtarget->hasVMemToLDSLoad())
return SDValue();
unsigned Opc;
bool HasVIndex =
IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
- IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
+ IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
+ IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
+ IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
unsigned OpOffset = HasVIndex ? 1 : 0;
SDValue VOffset = Op.getOperand(5 + OpOffset);
bool HasVOffset = !isNullConstant(VOffset);
@@ -11208,33 +11749,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
? 1
: 0,
DL, MVT::i8)); // swz
+    Ops.push_back(DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL,
+                                        MVT::i8)); // IsAsync
Ops.push_back(M0Val.getValue(0)); // Chain
Ops.push_back(M0Val.getValue(1)); // Glue
auto *M = cast<MemSDNode>(Op);
- MachineMemOperand *LoadMMO = M->getMemOperand();
- // Don't set the offset value here because the pointer points to the base of
- // the buffer.
- MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
-
- MachinePointerInfo StorePtrI = LoadPtrI;
- LoadPtrI.V = PoisonValue::get(
- PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
- LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
- StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
-
- auto F = LoadMMO->getFlags() &
- ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
- LoadMMO =
- MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
- LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
-
- MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
- StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
- LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
-
auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
- DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+ DAG.setNodeMemRefs(Load, M->memoperands());
return SDValue(Load, 0);
}
@@ -11242,7 +11764,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
// for "trust me" that the remaining cases are global pointers until
// such time as we can put two mem operands on an intrinsic.
case Intrinsic::amdgcn_load_to_lds:
- case Intrinsic::amdgcn_global_load_lds: {
+ case Intrinsic::amdgcn_load_async_to_lds:
+ case Intrinsic::amdgcn_global_load_lds:
+ case Intrinsic::amdgcn_global_load_async_lds: {
if (!Subtarget->hasVMemToLDSLoad())
return SDValue();
@@ -11307,30 +11831,19 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
Ops.push_back(Op.getOperand(5)); // Offset
- Ops.push_back(Op.getOperand(6)); // CPol
+
+ unsigned Aux = Op.getConstantOperandVal(6);
+ Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
+ MVT::i32)); // CPol
+    Ops.push_back(DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL,
+                                        MVT::i8)); // IsAsync
+
Ops.push_back(M0Val.getValue(0)); // Chain
Ops.push_back(M0Val.getValue(1)); // Glue
auto *M = cast<MemSDNode>(Op);
- MachineMemOperand *LoadMMO = M->getMemOperand();
- MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
- LoadPtrI.Offset = Op->getConstantOperandVal(5);
- MachinePointerInfo StorePtrI = LoadPtrI;
- LoadPtrI.V = PoisonValue::get(
- PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
- LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
- StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
- auto F = LoadMMO->getFlags() &
- ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
- LoadMMO =
- MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
- LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
- MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
- StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
- LoadMMO->getAAInfo());
-
auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
- DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+ DAG.setNodeMemRefs(Load, M->memoperands());
return SDValue(Load, 0);
}
@@ -11375,6 +11888,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
return SDValue(NewMI, 0);
}
+ case Intrinsic::amdgcn_s_wakeup_barrier: {
+ if (!Subtarget->hasSWakeupBarrier())
+ return SDValue();
+ [[fallthrough]];
+ }
case Intrinsic::amdgcn_s_barrier_join: {
// these three intrinsics have one operand: barrier pointer
SDValue Chain = Op->getOperand(0);
@@ -11384,16 +11902,32 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
if (isa<ConstantSDNode>(BarOp)) {
uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
- Opc = AMDGPU::S_BARRIER_JOIN_IMM;
-
+ switch (IntrinsicID) {
+ default:
+ return SDValue();
+ case Intrinsic::amdgcn_s_barrier_join:
+ Opc = AMDGPU::S_BARRIER_JOIN_IMM;
+ break;
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
+ break;
+ }
// extract the BarrierID from bits 4-9 of the immediate
unsigned BarID = (BarVal >> 4) & 0x3F;
SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
Ops.push_back(K);
Ops.push_back(Chain);
} else {
- Opc = AMDGPU::S_BARRIER_JOIN_M0;
-
+ switch (IntrinsicID) {
+ default:
+ return SDValue();
+ case Intrinsic::amdgcn_s_barrier_join:
+ Opc = AMDGPU::S_BARRIER_JOIN_M0;
+ break;
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
+ break;
+ }
// extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
SDValue M0Val;
M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
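
The hunk above picks between the _IMM and _M0 instruction forms and then pulls the barrier ID out of bits 4-9 of the operand. A minimal standalone sketch of that shift-and-mask, using a hypothetical immediate value (not taken from the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Hypothetical immediate: 0x2A5 = 0b10'1010'0101; bits 4-9 hold 0b101010.
      uint64_t BarVal = 0x2A5;
      unsigned BarID = (BarVal >> 4) & 0x3F; // keep 6 bits starting at bit 4
      assert(BarID == 42);
      return 0;
    }
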
@@ -11482,7 +12016,7 @@ SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
// On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
// being added, so we can only safely match a 32-bit addition with no
// unsigned overflow.
- bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
+ bool CheckNUW = Subtarget->hasGFX1250Insts();
if (!CheckNUW || isNoUnsignedWrap(N0)) {
C1 = cast<ConstantSDNode>(N0.getOperand(1));
N0 = N0.getOperand(0);
@@ -11542,11 +12076,15 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
}
}
if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
+ // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
+ // being added, so we can only safely match a 32-bit addition with no
+ // unsigned overflow.
+ bool CheckNUW = Subtarget->hasGFX1250Insts();
SDValue N0 = CombinedOffset.getOperand(0);
SDValue N1 = CombinedOffset.getOperand(1);
uint32_t SOffset, ImmOffset;
int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
- if (Offset >= 0 &&
+ if (Offset >= 0 && (!CheckNUW || isNoUnsignedWrap(CombinedOffset)) &&
TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
Offsets[0] = N0;
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
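
The CheckNUW guard added in both hunks exists because GFX1250 zero-extends voffset and immoffset to 64 bits before adding them, so splitting a constant out of a wrapping 32-bit add would change the effective address. A small self-contained illustration of the hazard, with assumed values:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Folding (base + C1) into a separate immediate offset assumes the
      // 32-bit add does not wrap; if it wraps, the hardware's zero-extended
      // 64-bit sum of the split parts differs from the 32-bit result.
      uint32_t Base = 0xFFFFFFF0u, C1 = 0x20u;
      uint32_t Wrapped32 = Base + C1;                   // 0x10 after wrap
      uint64_t ZextSum = uint64_t(Base) + uint64_t(C1); // 0x1'0000'0010
      assert(uint64_t(Wrapped32) != ZextSum); // the split form would be wrong
      return 0;
    }
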
@@ -11845,7 +12383,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
Align Alignment = Load->getAlign();
unsigned AS = Load->getAddressSpace();
- if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
+ if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
+ AS == AMDGPUAS::FLAT_ADDRESS &&
Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
return SplitVectorLoad(Op, DAG);
}
@@ -11866,7 +12405,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
(AS == AMDGPUAS::GLOBAL_ADDRESS &&
Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
- isMemOpHasNoClobberedMemOperand(Load))) {
+ (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
Alignment >= Align(4) && NumElements < 32) {
if (MemVT.isPow2VectorType() ||
@@ -12161,7 +12700,10 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
- SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
+ // TODO: The combiner should probably handle elimination of redundant fabs.
+ SDValue r1 = DAG.SignBitIsZeroFP(RHS)
+ ? RHS
+ : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
const APFloat K0Val(0x1p+96f);
const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
@@ -12466,7 +13008,8 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
Store->getValue().getValueType().getScalarType() == MVT::i32);
unsigned AS = Store->getAddressSpace();
- if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
+ if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
+ AS == AMDGPUAS::FLAT_ADDRESS &&
Store->getAlign().value() < VT.getStoreSize() &&
VT.getSizeInBits() > 32) {
return SplitVectorStore(Op, DAG);
@@ -12506,7 +13049,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
case 16:
if (NumElements > 4 ||
- (NumElements == 3 && !Subtarget->enableFlatScratch()))
+ (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
return SplitVectorStore(Op, DAG);
return SDValue();
default:
@@ -12728,23 +13271,36 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
// if Arg is already the result of a multiply by constant.
auto Flags = Op->getFlags();
+ // AMDGPUISD nodes of vector type must be unrolled here since
+ // they will not be expanded elsewhere.
+ auto UnrollIfVec = [&DAG](SDValue V) -> SDValue {
+ if (!V.getValueType().isVector())
+ return V;
+
+ return DAG.UnrollVectorOp(V.getNode());
+ };
+
SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
if (Subtarget->hasTrigReducedRange()) {
SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
- TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
+ TrigVal = UnrollIfVec(DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags));
} else {
TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
}
switch (Op.getOpcode()) {
case ISD::FCOS:
- return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
+ TrigVal = DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
+ break;
case ISD::FSIN:
- return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
+ TrigVal = DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
+ break;
default:
llvm_unreachable("Wrong trig opcode");
}
+
+ return UnrollIfVec(TrigVal);
}
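
Because no later expansion exists for vector-typed AMDGPUISD::FRACT, SIN_HW, or COS_HW, the UnrollIfVec lambda scalarizes them here: UnrollVectorOp emits one scalar node per element and rebuilds the vector result. A rough scalar-level sketch of the idea (illustrative only, not SelectionDAG code):

    #include <array>
    #include <cmath>

    // Stand-in for unrolling a 2-wide vector FRACT into per-element scalars.
    std::array<float, 2> fract_v2(std::array<float, 2> V) {
      std::array<float, 2> R;
      for (int I = 0; I < 2; ++I)
        R[I] = V[I] - std::floor(V[I]); // scalar fract per element
      return R; // conceptually the BUILD_VECTOR of the scalar results
    }
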
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
@@ -13398,6 +13954,7 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
}
+ case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND_INREG: {
@@ -13904,6 +14461,12 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
assert(OtherOp.getValueSizeInBits() == 32);
}
+ // Check that we haven't just recreated the same FSHR node.
+ if (N->getOpcode() == ISD::FSHR &&
+ (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
+ (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
+ return SDValue();
+
if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
assert(Op.getValueType().isByteSized() &&
@@ -14181,10 +14744,11 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
return SDValue();
}
-SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
+SDValue
+SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
if (!Subtarget->has16BitInsts() ||
- DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ DCI.getDAGCombineLevel() < AfterLegalizeTypes)
return SDValue();
EVT VT = N->getValueType(0);
@@ -14195,7 +14759,44 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
if (Src.getValueType() != MVT::i16)
return SDValue();
- return SDValue();
+ if (!Src->hasOneUse())
+ return SDValue();
+
+ // TODO: We bail out below if SrcOffset is not in the first dword (>= 4). It's
+ // possible we're missing out on some combine opportunities, but we'd need to
+ // weigh the cost of extracting the byte from the upper dwords.
+
+ std::optional<ByteProvider<SDValue>> BP0 =
+ calculateByteProvider(SDValue(N, 0), 0, 0, 0);
+ if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
+ return SDValue();
+ SDValue V0 = *BP0->Src;
+
+ std::optional<ByteProvider<SDValue>> BP1 =
+ calculateByteProvider(SDValue(N, 0), 1, 0, 1);
+ if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
+ return SDValue();
+
+ SDValue V1 = *BP1->Src;
+
+ if (V0 == V1)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+ uint32_t PermMask = 0x0c0c0c0c;
+ if (V0) {
+ V0 = DAG.getBitcastedAnyExtOrTrunc(V0, DL, MVT::i32);
+ PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
+ }
+
+ if (V1) {
+ V1 = DAG.getBitcastedAnyExtOrTrunc(V1, DL, MVT::i32);
+ PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
+ }
+
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, V0, V1,
+ DAG.getConstant(PermMask, DL, MVT::i32));
}
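
The mask built above is a per-byte selector for AMDGPUISD::PERM: result byte i is chosen by mask byte i, with selectors 0-3 reading the second source, 4-7 reading the first, and 0x0C (by the backend's convention for this node) yielding a zero byte. That is why the default mask is 0x0c0c0c0c and only the two low mask bytes are patched. A sketch of the assumed byte-select semantics:

    #include <cassert>
    #include <cstdint>

    // Sketch of the perm-style byte select assumed above: sel 0-3 take bytes
    // of Src1, sel 4-7 take bytes of Src0, and 0x0C yields a zero byte.
    uint32_t perm(uint32_t Src0, uint32_t Src1, uint32_t Mask) {
      uint32_t R = 0;
      for (int I = 0; I < 4; ++I) {
        uint32_t Sel = (Mask >> (8 * I)) & 0xFF;
        uint32_t B = 0;
        if (Sel < 4)
          B = (Src1 >> (8 * Sel)) & 0xFF;
        else if (Sel < 8)
          B = (Src0 >> (8 * (Sel - 4))) & 0xFF;
        // Sel == 0x0C stays a constant zero byte here.
        R |= B << (8 * I);
      }
      return R;
    }

    int main() {
      // Mask 0x0C0C0004: byte 0 <- Src0 byte 0, byte 1 <- Src1 byte 0,
      // upper two bytes zeroed -- mirroring the combine's layout.
      assert(perm(0xAABBCCDD, 0x11223344, 0x0C0C0004) == 0x000044DD);
      return 0;
    }
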
SDValue
@@ -14299,6 +14900,7 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N,
}
bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
+ SDNodeFlags UserFlags,
unsigned MaxDepth) const {
unsigned Opcode = Op.getOpcode();
if (Opcode == ISD::FCANONICALIZE)
@@ -14498,7 +15100,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
// FIXME: denormalsEnabledForType is broken for dynamic
return denormalsEnabledForType(DAG, Op.getValueType()) &&
- DAG.isKnownNeverSNaN(Op);
+ (UserFlags.hasNoNaNs() || DAG.isKnownNeverSNaN(Op));
}
bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
@@ -14993,8 +15595,9 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
// for some types, but at a higher cost since it's implemented with a 3
// operand form.
const SDNodeFlags Flags = N->getFlags();
- if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
- !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
+ if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && Flags.hasNoNaNs() &&
+ !Subtarget->hasIEEEMinimumMaximumInsts() &&
+ isOperationLegal(ISD::FMINNUM_IEEE, VT.getScalarType())) {
unsigned NewOpc =
Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
@@ -16335,7 +16938,9 @@ SDValue SITargetLowering::performFDivCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
SDLoc SL(N);
EVT VT = N->getValueType(0);
- if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
+
+ // fsqrt legality correlates with rsq availability.
+ if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT))
return SDValue();
SDValue LHS = N->getOperand(0);
@@ -16370,7 +16975,7 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
EVT ScalarVT = VT.getScalarType();
- EVT IntVT = VT.changeElementType(MVT::i32);
+ EVT IntVT = VT.changeElementType(*DAG.getContext(), MVT::i32);
if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
(ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
@@ -16548,7 +17153,6 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
LHS.getOpcode() == ISD::SELECT &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
isa<ConstantSDNode>(LHS.getOperand(2)) &&
- LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
isBoolSGPR(LHS.getOperand(0))) {
// Given CT != FT:
// setcc (select cc, CT, CF), CF, eq => xor cc, -1
@@ -16558,13 +17162,34 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
const APInt &CT = LHS.getConstantOperandAPInt(1);
const APInt &CF = LHS.getConstantOperandAPInt(2);
- if ((CF == CRHSVal && CC == ISD::SETEQ) ||
- (CT == CRHSVal && CC == ISD::SETNE))
- return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
- DAG.getAllOnesConstant(SL, MVT::i1));
- if ((CF == CRHSVal && CC == ISD::SETNE) ||
- (CT == CRHSVal && CC == ISD::SETEQ))
- return LHS.getOperand(0);
+ if (CT != CF) {
+ if ((CF == CRHSVal && CC == ISD::SETEQ) ||
+ (CT == CRHSVal && CC == ISD::SETNE))
+ return DAG.getNOT(SL, LHS.getOperand(0), MVT::i1);
+ if ((CF == CRHSVal && CC == ISD::SETNE) ||
+ (CT == CRHSVal && CC == ISD::SETEQ))
+ return LHS.getOperand(0);
+ }
+ }
+
+ // setcc v.64, 0xXXXX'XXXX'0000'0000, lt/ge
+ // => setcc v.hi32, 0xXXXX'XXXX, lt/ge
+ //
+ // setcc v.64, 0xXXXX'XXXX'FFFF'FFFF, le/gt
+ // => setcc v.hi32, 0xXXXX'XXXX, le/gt
+ if (VT == MVT::i64) {
+ const uint64_t Mask32 = maskTrailingOnes<uint64_t>(32);
+ const uint64_t CRHSInt = CRHSVal.getZExtValue();
+
+ if ( // setcc v.64, 0xXXXX'XXXX'0000'0000, lt/ge
+ ((CRHSInt & Mask32) == 0 && (CC == ISD::SETULT || CC == ISD::SETUGE ||
+ CC == ISD::SETLT || CC == ISD::SETGE)) ||
+ // setcc v.64, 0xXXXX'XXXX'FFFF'FFFF, le/gt
+ ((CRHSInt & Mask32) == Mask32 &&
+ (CC == ISD::SETULE || CC == ISD::SETUGT || CC == ISD::SETLE ||
+ CC == ISD::SETGT)))
+ return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
+ DAG.getConstant(CRHSInt >> 32, SL, MVT::i32), CC);
}
}
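
The narrowing above is sound because when the constant's low 32 bits are all zero, a lt/ge comparison is decided entirely by the high halves (and symmetrically, the all-ones pattern works for le/gt). A quick standalone check of the unsigned lt case with sample values:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t C = 0x12345678'00000000ull; // low 32 bits are zero
      for (uint64_t V : {0x12345677'FFFFFFFFull, 0x12345678'00000000ull,
                         0x12345678'00000001ull}) {
        bool Wide = V < C;
        bool Narrow = uint32_t(V >> 32) < uint32_t(C >> 32); // hi-half only
        assert(Wide == Narrow);
      }
      return 0;
    }
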
@@ -16877,8 +17502,9 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
}
case ISD::XOR:
return performXorCombine(N, DCI);
+ case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND:
- return performZeroExtendCombine(N, DCI);
+ return performZeroOrAnyExtendCombine(N, DCI);
case ISD::SIGN_EXTEND_INREG:
return performSignExtendInRegCombine(N, DCI);
case AMDGPUISD::FP_CLASS:
@@ -17335,12 +17961,14 @@ void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
// Abandon attempt if the dst size isn't large enough
// - this is in fact an error but this is picked up elsewhere and
// reported correctly.
- uint32_t DstSize =
- TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
+ const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
+
+ uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
if (DstSize < InitIdx)
return;
} else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
- InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
+ const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
+ InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
} else {
return;
}
@@ -17388,7 +18016,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
- MachineFunction *MF = MI.getParent()->getParent();
+ MachineFunction *MF = MI.getMF();
MachineRegisterInfo &MRI = MF->getRegInfo();
if (TII->isVOP3(MI.getOpcode())) {
@@ -17524,6 +18152,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
break;
case 'v':
switch (BitWidth) {
+ case 1:
+ return std::pair(0U, nullptr);
case 16:
RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
: &AMDGPU::VGPR_32_Lo256RegClass;
@@ -17541,6 +18171,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
if (!Subtarget->hasMAIInsts())
break;
switch (BitWidth) {
+ case 1:
+ return std::pair(0U, nullptr);
case 16:
RC = &AMDGPU::AGPR_32RegClass;
break;
@@ -18050,6 +18682,11 @@ void SITargetLowering::computeKnownBitsForTargetInstr(
case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
Known.Zero.setHighBits(16);
break;
+ case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
+ // G_AMDGPU_COPY_SCC_VCC converts a uniform boolean in VCC to SGPR s32,
+ // producing exactly 0 or 1.
+ Known.Zero.setHighBits(Known.getBitWidth() - 1);
+ break;
case AMDGPU::G_AMDGPU_SMED3:
case AMDGPU::G_AMDGPU_UMED3: {
auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
@@ -18226,7 +18863,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
case AMDGPUISD::BUFFER_ATOMIC_INC:
case AMDGPUISD::BUFFER_ATOMIC_DEC:
case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
- case AMDGPUISD::BUFFER_ATOMIC_CSUB:
case AMDGPUISD::BUFFER_ATOMIC_FADD:
case AMDGPUISD::BUFFER_ATOMIC_FMIN:
case AMDGPUISD::BUFFER_ATOMIC_FMAX:
@@ -18378,12 +19014,12 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
// With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
// allocations work.
if (HasSystemScope) {
- if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
+ if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
RMW->hasMetadata("amdgpu.no.remote.memory"))
return true;
if (Subtarget.hasEmulatedSystemScopeAtomics())
return true;
- } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
+ } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
return true;
return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
@@ -18413,7 +19049,7 @@ getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
}
TargetLowering::AtomicExpansionKind
-SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
+SITargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
return getPrivateAtomicExpansionKind(*getSubtarget());
@@ -18461,7 +19097,19 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
case AtomicRMWInst::UIncWrap:
- case AtomicRMWInst::UDecWrap: {
+ case AtomicRMWInst::UDecWrap:
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat: {
+ if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
+ return AtomicExpansionKind::CmpXChg;
+ if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
+ return AtomicExpansionKind::CmpXChg;
+ if (Op == AtomicRMWInst::USubCond || Op == AtomicRMWInst::USubSat) {
+ auto *IT = dyn_cast<IntegerType>(RMW->getType());
+ if (!IT || IT->getBitWidth() != 32)
+ return AtomicExpansionKind::CmpXChg;
+ }
+
if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
if (Subtarget->hasEmulatedSystemScopeAtomics())
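
When the conditional-sub or sub-clamp instructions are missing, or the value type is not i32, the new cases fall back to AtomicExpansionKind::CmpXChg. As a minimal runtime sketch of what that expansion means for usub_sat, written with standard C++ atomics rather than the backend's actual lowering:

    #include <atomic>
    #include <cstdint>

    // Compare-exchange loop computing atomic saturating subtraction.
    uint32_t atomic_usub_sat(std::atomic<uint32_t> &A, uint32_t V) {
      uint32_t Old = A.load();
      uint32_t New;
      do {
        New = Old > V ? Old - V : 0; // saturate at zero
      } while (!A.compare_exchange_weak(Old, New)); // retry on contention
      return Old; // atomicrmw yields the previous value
    }
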
@@ -18481,7 +19129,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// If fine-grained remote memory works at device scope, we don't need to
// do anything.
if (!HasSystemScope &&
- Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
+ Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
return atomicSupportedIfLegalIntType(RMW);
// If we are targeting a remote allocated address, it depends what kind of
@@ -18500,7 +19148,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
Op == AtomicRMWInst::Xor) {
// Atomic sub/or/xor do not work over PCI express, but atomic add
// does. InstCombine transforms these with 0 to or, so undo that.
- if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
+ if (const Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
ConstVal && ConstVal->isNullValue())
return AtomicExpansionKind::CustomExpand;
}
@@ -18699,7 +19347,8 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
}
TargetLowering::AtomicExpansionKind
-SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
+SITargetLowering::shouldExpandAtomicCmpXchgInIR(
+ const AtomicCmpXchgInst *CmpX) const {
unsigned AddrSpace = CmpX->getPointerAddressSpace();
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
return getPrivateAtomicExpansionKind(*getSubtarget());
@@ -18726,8 +19375,11 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
: &AMDGPU::SReg_32RegClass;
if (!TRI->isSGPRClass(RC) && !isDivergent)
return TRI->getEquivalentSGPRClass(RC);
- if (TRI->isSGPRClass(RC) && isDivergent)
+ if (TRI->isSGPRClass(RC) && isDivergent) {
+ if (Subtarget->hasGFX90AInsts())
+ return TRI->getEquivalentAVClass(RC);
return TRI->getEquivalentVGPRClass(RC);
+ }
return RC;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 74e58f4..59b8f43 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -45,6 +45,8 @@ public:
LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
unsigned &NumIntermediates, MVT &RegisterVT) const override;
+ MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const;
+
private:
SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
SDValue Chain, uint64_t Offset) const;
@@ -77,6 +79,8 @@ private:
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
+ SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+
SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
MVT VT, unsigned Offset) const;
SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
@@ -128,6 +132,7 @@ private:
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const;
SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
SelectionDAG &DAG, ArrayRef<SDValue> Ops,
bool IsIntrinsic = false) const;
@@ -205,7 +210,7 @@ private:
SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performZeroOrAnyExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSignExtendInRegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue getCanonicalConstantFP(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
@@ -332,7 +337,7 @@ public:
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override;
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override;
- bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
+ void getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &, const CallBase &,
MachineFunction &MF,
unsigned IntrinsicID) const override;
@@ -555,7 +560,7 @@ public:
Register N1) const override;
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
- unsigned MaxDepth = 5) const;
+ SDNodeFlags UserFlags = {}, unsigned MaxDepth = 5) const;
bool isCanonicalized(Register Reg, const MachineFunction &MF,
unsigned MaxDepth = 5) const;
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const;
@@ -564,11 +569,12 @@ public:
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts,
const SelectionDAG &DAG, bool SNaN = false,
unsigned Depth = 0) const override;
- AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
+ AtomicExpansionKind
+ shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override;
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
AtomicExpansionKind
- shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+ shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override;
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6dcbced..1118675 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -42,6 +42,7 @@
#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
+using namespace llvm::AMDGPU;
#define DEBUG_TYPE "si-insert-waitcnts"
@@ -63,58 +64,96 @@ static cl::opt<bool> ForceEmitZeroLoadFlag(
cl::desc("Force all waitcnt load counters to wait until 0"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> ExpertSchedulingModeFlag(
+ "amdgpu-expert-scheduling-mode",
+ cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
+ cl::init(false), cl::Hidden);
+
namespace {
-// Class of object that encapsulates latest instruction counter score
-// associated with the operand. Used for determining whether
-// s_waitcnt instruction needs to be emitted.
-
-enum InstCounterType {
- LOAD_CNT = 0, // VMcnt prior to gfx12.
- DS_CNT, // LKGMcnt prior to gfx12.
- EXP_CNT, //
- STORE_CNT, // VScnt in gfx10/gfx11.
- NUM_NORMAL_INST_CNTS,
- SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
- BVH_CNT, // gfx12+ only.
- KM_CNT, // gfx12+ only.
- X_CNT, // gfx1250.
- NUM_EXTENDED_INST_CNTS,
- NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
-};
-} // namespace
+// Get the maximum wait count value for a given counter type.
+static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
+ InstCounterType T) {
+ switch (T) {
+ case LOAD_CNT:
+ return Limits.LoadcntMax;
+ case DS_CNT:
+ return Limits.DscntMax;
+ case EXP_CNT:
+ return Limits.ExpcntMax;
+ case STORE_CNT:
+ return Limits.StorecntMax;
+ case SAMPLE_CNT:
+ return Limits.SamplecntMax;
+ case BVH_CNT:
+ return Limits.BvhcntMax;
+ case KM_CNT:
+ return Limits.KmcntMax;
+ case X_CNT:
+ return Limits.XcntMax;
+ case VA_VDST:
+ return Limits.VaVdstMax;
+ case VM_VSRC:
+ return Limits.VmVsrcMax;
+ default:
+ return 0;
+ }
+}
-namespace llvm {
-template <> struct enum_iteration_traits<InstCounterType> {
- static constexpr bool is_iterable = true;
-};
-} // namespace llvm
+static bool isSoftXcnt(MachineInstr &MI) {
+ return MI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft;
+}
-namespace {
-// Return an iterator over all counters between LOAD_CNT (the first counter)
-// and \c MaxCounter (exclusive, default value yields an enumeration over
-// all counters).
-auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
- return enum_seq(LOAD_CNT, MaxCounter);
+static bool isAtomicRMW(MachineInstr &MI) {
+ return (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) && MI.mayLoad() &&
+ MI.mayStore();
}
-using RegInterval = std::pair<int, int>;
-
-struct HardwareLimits {
- unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
- unsigned ExpcntMax;
- unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
- unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
- unsigned SamplecntMax; // gfx12+ only.
- unsigned BvhcntMax; // gfx12+ only.
- unsigned KmcntMax; // gfx12+ only.
- unsigned XcntMax; // gfx1250.
+enum class AtomicRMWState {
+ NewBlock, // Start of a new atomic RMW block
+ InsideBlock, // Middle of an existing block
+ NotInBlock // Not in an atomic RMW block
};
+/// Integer IDs used to track vector memory locations we may have to wait on.
+/// Encoded as u16 chunks:
+///
+/// [REGUNITS_BEGIN, REGUNITS_END): MCRegUnit
+/// [LDSDMA_BEGIN, LDSDMA_END): LDS DMA IDs
+///
+/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
+/// It gives 1 << 16 (65536) entries per category, which is more than enough
+/// for all register units. MCPhysReg is u16 so we don't even support >u16
+/// physical register numbers at this time, let alone >u16 register units.
+/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
+/// is enough for all register units.
+using VMEMID = uint32_t;
+
+enum : VMEMID {
+ TRACKINGID_RANGE_LEN = (1 << 16),
+
+ // Important: MCRegUnits must always be tracked starting from 0, as we
+ // need to be able to convert between a MCRegUnit and a VMEMID freely.
+ REGUNITS_BEGIN = 0,
+ REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
+
+ // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common"
+ // entry, which is updated for all LDS DMA operations encountered.
+ // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
+ NUM_LDSDMA = TRACKINGID_RANGE_LEN,
+ LDSDMA_BEGIN = REGUNITS_END,
+ LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
+};
+
+/// Convert an MCRegUnit to a VMEMID.
+static constexpr VMEMID toVMEMID(MCRegUnit RU) {
+ return static_cast<unsigned>(RU);
+}
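
Under this layout a register unit maps to itself and LDS DMA slot N (slot 0 being the common entry with no associated MachineInstr) maps to LDSDMA_BEGIN + N, so decoding is a range check. A small sketch of the intended round-trip, with the constants mirrored locally:

    #include <cassert>
    #include <cstdint>

    // Local mirror of the VMEMID layout described above.
    enum : uint32_t {
      RANGE = 1u << 16,
      REGUNITS_LO = 0,
      LDSDMA_LO = REGUNITS_LO + RANGE, // the "common" LDS DMA entry
    };

    int main() {
      uint32_t RU = 42;              // a register unit maps to itself
      uint32_t Dma1 = LDSDMA_LO + 1; // first specific LDS DMA ID
      assert(RU < LDSDMA_LO);        // decodes as a register unit
      assert(Dma1 >= LDSDMA_LO && Dma1 - LDSDMA_LO - 1 == 0); // LDSDMAStores[0]
      return 0;
    }
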
+
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
- DECL(VMEM_ACCESS) /* vmem read & write */ \
- DECL(VMEM_READ_ACCESS) /* vmem read */ \
+ DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \
DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
+ DECL(GLOBAL_INV_ACCESS) /* GLOBAL_INV (gfx12+ only) */ \
DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
DECL(VMEM_GROUP) /* vmem group */ \
@@ -129,7 +168,14 @@ struct HardwareLimits {
DECL(EXP_POS_ACCESS) /* write to export position */ \
DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \
DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \
- DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */
+ DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */ \
+ DECL(VGPR_CSMACC_WRITE) /* write VGPR dest in Core/Side-MACC VALU */ \
+ DECL(VGPR_DPMACC_WRITE) /* write VGPR dest in DPMACC VALU */ \
+ DECL(VGPR_TRANS_WRITE) /* write VGPR dest in TRANS VALU */ \
+ DECL(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */ \
+ DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \
+ DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \
+ DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */
// clang-format off
#define AMDGPU_EVENT_ENUM(Name) Name,
@@ -138,38 +184,33 @@ enum WaitEventType {
NUM_WAIT_EVENTS
};
#undef AMDGPU_EVENT_ENUM
+} // namespace
+
+namespace llvm {
+template <> struct enum_iteration_traits<WaitEventType> {
+ static constexpr bool is_iterable = true;
+};
+} // namespace llvm
+
+namespace {
+
+/// Return an iterator over all events between VMEM_ACCESS (the first event)
+/// and \c MaxEvent (exclusive, default value yields an enumeration over
+/// all events).
+auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
+ return enum_seq(VMEM_ACCESS, MaxEvent);
+}
#define AMDGPU_EVENT_NAME(Name) #Name,
static constexpr StringLiteral WaitEventTypeName[] = {
AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
};
#undef AMDGPU_EVENT_NAME
+static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
+ return WaitEventTypeName[Event];
+}
// clang-format on
-// The mapping is:
-// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
-// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
-// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
-// NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS .. SCC
-// We reserve a fixed number of VGPR slots in the scoring tables for
-// special tokens like SCMEM_LDS (needed for buffer load to LDS).
-enum RegisterMapping {
- SQ_MAX_PGM_VGPRS = 2048, // Maximum programmable VGPRs across all targets.
- AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets.
- SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets.
- // Artificial register slots to track LDS writes into specific LDS locations
- // if a location is known. When slots are exhausted or location is
- // unknown use the first slot. The first slot is also always updated in
- // addition to known location's slot to properly generate waits if dependent
- // instruction's location is unknown.
- FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores.
- NUM_LDS_VGPRS = 9, // One more than the stores we track.
- NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start.
- NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS,
- // Remaining non-allocatable registers
- SCC = NUM_ALL_ALLOCATABLE
-};
-
// Enumerate different types of result-returning VMEM operations. Although
// s_waitcnt orders them all with a single vmcnt counter, in the absence of
// s_waitcnt only instructions of the same VmemType are guaranteed to write
@@ -187,7 +228,7 @@ enum VmemType {
// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
-// returns true.
+// returns true, and does not cover VA_VDST or VM_VSRC.
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
@@ -224,49 +265,80 @@ VmemType getVmemType(const MachineInstr &Inst) {
return VMEM_NOSAMPLER;
}
-unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
- switch (T) {
- case LOAD_CNT:
- return Wait.LoadCnt;
- case EXP_CNT:
- return Wait.ExpCnt;
- case DS_CNT:
- return Wait.DsCnt;
- case STORE_CNT:
- return Wait.StoreCnt;
- case SAMPLE_CNT:
- return Wait.SampleCnt;
- case BVH_CNT:
- return Wait.BvhCnt;
- case KM_CNT:
- return Wait.KmCnt;
- case X_CNT:
- return Wait.XCnt;
- default:
- llvm_unreachable("bad InstCounterType");
- }
-}
-
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
- unsigned &WC = getCounterRef(Wait, T);
- WC = std::min(WC, Count);
+ Wait.set(T, std::min(Wait.get(T), Count));
}
-void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
- getCounterRef(Wait, T) = ~0u;
-}
+void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { Wait.set(T, ~0u); }
-unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
- return getCounterRef(Wait, T);
-}
+/// A small set of events.
+class WaitEventSet {
+ unsigned Mask = 0;
-// Mapping from event to counter according to the table masks.
-InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
- for (auto T : inst_counter_types()) {
- if (masks[T] & (1 << E))
- return T;
+public:
+ WaitEventSet() = default;
+ explicit constexpr WaitEventSet(WaitEventType Event) {
+ static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * 8,
+ "Not enough bits in Mask for all the events");
+ Mask |= 1 << Event;
+ }
+ constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
+ for (auto &E : Events) {
+ Mask |= 1 << E;
+ }
+ }
+ void insert(const WaitEventType &Event) { Mask |= 1 << Event; }
+ void remove(const WaitEventType &Event) { Mask &= ~(1 << Event); }
+ void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; }
+ bool contains(const WaitEventType &Event) const {
+ return Mask & (1 << Event);
+ }
+ /// \returns true if this set contains all elements of \p Other.
+ bool contains(const WaitEventSet &Other) const {
+ return (~Mask & Other.Mask) == 0;
+ }
+ /// \returns the intersection of this and \p Other.
+ WaitEventSet operator&(const WaitEventSet &Other) const {
+ auto Copy = *this;
+ Copy.Mask &= Other.Mask;
+ return Copy;
+ }
+ /// \returns the union of this and \p Other.
+ WaitEventSet operator|(const WaitEventSet &Other) const {
+ auto Copy = *this;
+ Copy.Mask |= Other.Mask;
+ return Copy;
+ }
+ /// This set becomes the union of this and \p Other.
+ WaitEventSet &operator|=(const WaitEventSet &Other) {
+ Mask |= Other.Mask;
+ return *this;
+ }
+ /// This set becomes the intersection of this and \p Other.
+ WaitEventSet &operator&=(const WaitEventSet &Other) {
+ Mask &= Other.Mask;
+ return *this;
+ }
+ bool operator==(const WaitEventSet &Other) const {
+ return Mask == Other.Mask;
+ }
+ bool operator!=(const WaitEventSet &Other) const { return !(*this == Other); }
+ bool empty() const { return Mask == 0; }
+ /// \returns true if the set contains more than one element.
+ bool twoOrMore() const { return Mask & (Mask - 1); }
+ operator bool() const { return !empty(); }
+ void print(raw_ostream &OS) const {
+ ListSeparator LS(", ");
+ for (WaitEventType Event : wait_events()) {
+ OS << LS << getWaitEventTypeName(Event);
+ }
}
- llvm_unreachable("event type has no associated counter");
+ LLVM_DUMP_METHOD void dump() const;
+};
+
+void WaitEventSet::dump() const {
+ print(dbgs());
+ dbgs() << "\n";
}
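
twoOrMore() leans on the classic clear-lowest-set-bit identity: Mask & (Mask - 1) drops exactly one set bit, so the result is nonzero iff at least two bits were set. A quick standalone check of the identity, independent of the event enum:

    #include <cassert>

    int main() {
      auto TwoOrMore = [](unsigned Mask) { return (Mask & (Mask - 1)) != 0; };
      assert(!TwoOrMore(0u));                   // empty set
      assert(!TwoOrMore(1u << 5));              // one element
      assert(TwoOrMore((1u << 3) | (1u << 7))); // two elements
      return 0;
    }
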
class WaitcntBrackets;
@@ -279,24 +351,33 @@ class WaitcntBrackets;
// otherwise have had to become.
class WaitcntGenerator {
protected:
- const GCNSubtarget *ST = nullptr;
- const SIInstrInfo *TII = nullptr;
+ const GCNSubtarget &ST;
+ const SIInstrInfo &TII;
AMDGPU::IsaVersion IV;
InstCounterType MaxCounter;
bool OptNone;
+ bool ExpandWaitcntProfiling = false;
+ const AMDGPU::HardwareLimits *Limits = nullptr;
public:
- WaitcntGenerator() = default;
- WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
- : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
- IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
+ WaitcntGenerator() = delete;
+ WaitcntGenerator(const WaitcntGenerator &) = delete;
+ WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
+ const AMDGPU::HardwareLimits *Limits)
+ : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+ IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter),
OptNone(MF.getFunction().hasOptNone() ||
- MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
+ MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
+ ExpandWaitcntProfiling(
+ MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
+ Limits(Limits) {}
// Return true if the current function should be compiled with no
// optimization.
bool isOptNone() const { return OptNone; }
+ const AMDGPU::HardwareLimits &getLimits() const { return *Limits; }
+
// Edits an existing sequence of wait count instructions according
// to an incoming Waitcnt value, which is itself updated to reflect
// any new wait count instructions which may need to be generated by
@@ -316,39 +397,51 @@ public:
// Transform a soft waitcnt into a normal one.
bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
- // Generates new wait count instructions according to the value of
+ // Generates new wait count instructions according to the value of
// Wait, returning true if any new instructions were created.
+ // ScoreBrackets is used for profiling expansion.
virtual bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) = 0;
+ AMDGPU::Waitcnt Wait,
+ const WaitcntBrackets &ScoreBrackets) = 0;
- // Returns an array of bit masks which can be used to map values in
- // WaitEventType to corresponding counter values in InstCounterType.
- virtual const unsigned *getWaitEventMask() const = 0;
+ // Returns the WaitEventSet that corresponds to counter \p T.
+ virtual const WaitEventSet &getWaitEvents(InstCounterType T) const = 0;
+
+ /// \returns the counter that corresponds to event \p E.
+ InstCounterType getCounterFromEvent(WaitEventType E) const {
+ for (auto T : inst_counter_types()) {
+ if (getWaitEvents(T).contains(E))
+ return T;
+ }
+ llvm_unreachable("event type has no associated counter");
+ }
// Returns a new waitcnt with all counters except VScnt set to 0. If
// IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
virtual ~WaitcntGenerator() = default;
-
- // Create a mask value from the initializer list of wait event types.
- static constexpr unsigned
- eventMask(std::initializer_list<WaitEventType> Events) {
- unsigned Mask = 0;
- for (auto &E : Events)
- Mask |= 1 << E;
-
- return Mask;
- }
};
-class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
-public:
- WaitcntGeneratorPreGFX12() = default;
- WaitcntGeneratorPreGFX12(const MachineFunction &MF)
- : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
+class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
+ static constexpr const WaitEventSet
+ WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
+ WaitEventSet(
+ {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
+ WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
+ WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
+ EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
+ WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
+ WaitEventSet(),
+ WaitEventSet(),
+ WaitEventSet(),
+ WaitEventSet(),
+ WaitEventSet(),
+ WaitEventSet()};
+public:
+ using WaitcntGenerator::WaitcntGenerator;
bool
applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
@@ -356,35 +449,41 @@ public:
bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) override;
-
- const unsigned *getWaitEventMask() const override {
- assert(ST);
+ AMDGPU::Waitcnt Wait,
+ const WaitcntBrackets &ScoreBrackets) override;
- static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
- eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
- VMEM_BVH_READ_ACCESS}),
- eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
- eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
- EXP_POS_ACCESS, EXP_LDS_ACCESS}),
- eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
- 0,
- 0,
- 0,
- 0};
-
- return WaitEventMaskForInstPreGFX12;
+ const WaitEventSet &getWaitEvents(InstCounterType T) const override {
+ return WaitEventMaskForInstPreGFX12[T];
}
AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
-class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
+class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
+protected:
+ bool IsExpertMode;
+ static constexpr const WaitEventSet
+ WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
+ WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
+ WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
+ WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
+ EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
+ WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
+ WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
+ WaitEventSet({VMEM_BVH_READ_ACCESS}),
+ WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
+ WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
+ WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
+ VGPR_XDL_WRITE}),
+ WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
+
public:
- WaitcntGeneratorGFX12Plus() = default;
+ WaitcntGeneratorGFX12Plus() = delete;
WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
- InstCounterType MaxCounter)
- : WaitcntGenerator(MF, MaxCounter) {}
+ InstCounterType MaxCounter,
+ const AMDGPU::HardwareLimits *Limits,
+ bool IsExpertMode)
+ : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
bool
applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -393,28 +492,22 @@ public:
bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) override;
-
- const unsigned *getWaitEventMask() const override {
- assert(ST);
+ AMDGPU::Waitcnt Wait,
+ const WaitcntBrackets &ScoreBrackets) override;
- static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
- eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
- eventMask({LDS_ACCESS, GDS_ACCESS}),
- eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
- EXP_POS_ACCESS, EXP_LDS_ACCESS}),
- eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
- eventMask({VMEM_SAMPLER_READ_ACCESS}),
- eventMask({VMEM_BVH_READ_ACCESS}),
- eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
- eventMask({VMEM_GROUP, SMEM_GROUP})};
-
- return WaitEventMaskForInstGFX12Plus;
+ const WaitEventSet &getWaitEvents(InstCounterType T) const override {
+ return WaitEventMaskForInstGFX12Plus[T];
}
AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
+// Flags indicating which counters should be flushed in a loop preheader.
+struct PreheaderFlushFlags {
+ bool FlushVmCnt = false;
+ bool FlushDsCnt = false;
+};
+
class SIInsertWaitcnts {
public:
const GCNSubtarget *ST;
@@ -423,11 +516,11 @@ public:
const MachineRegisterInfo *MRI = nullptr;
InstCounterType SmemAccessCounter;
InstCounterType MaxCounter;
- const unsigned *WaitEventMaskForInst;
+ bool IsExpertMode = false;
private:
DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
- DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
+ DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
MachineLoopInfo *MLI;
MachinePostDominatorTree *PDT;
AliasAnalysis *AA = nullptr;
@@ -441,19 +534,18 @@ private:
bool ForceEmitWaitcnt[NUM_INST_CNTS];
- // In any given run of this pass, WCG will point to one of these two
- // generator objects, which must have been re-initialised before use
- // from a value made using a subtarget constructor.
- WaitcntGeneratorPreGFX12 WCGPreGFX12;
- WaitcntGeneratorGFX12Plus WCGGFX12Plus;
+ std::unique_ptr<WaitcntGenerator> WCG;
- WaitcntGenerator *WCG = nullptr;
+ // Remember call and return instructions in the function.
+ DenseSet<MachineInstr *> CallInsts;
+ DenseSet<MachineInstr *> ReturnInsts;
- // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
- // message.
- DenseSet<MachineInstr *> ReleaseVGPRInsts;
+ // Remember all S_ENDPGM instructions. The boolean flag is true if there might
+ // be outstanding stores but definitely no outstanding scratch stores, to help
+ // with insertion of DEALLOC_VGPRS messages.
+ DenseMap<MachineInstr *, bool> EndPgmInsts;
- HardwareLimits Limits;
+ AMDGPU::HardwareLimits Limits;
public:
SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
@@ -464,34 +556,15 @@ public:
(void)ForceVMCounter;
}
- unsigned getWaitCountMax(InstCounterType T) const {
- switch (T) {
- case LOAD_CNT:
- return Limits.LoadcntMax;
- case DS_CNT:
- return Limits.DscntMax;
- case EXP_CNT:
- return Limits.ExpcntMax;
- case STORE_CNT:
- return Limits.StorecntMax;
- case SAMPLE_CNT:
- return Limits.SamplecntMax;
- case BVH_CNT:
- return Limits.BvhcntMax;
- case KM_CNT:
- return Limits.KmcntMax;
- case X_CNT:
- return Limits.XcntMax;
- default:
- break;
- }
- return 0;
- }
+ const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
- bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
- bool isPreheaderToFlush(MachineBasicBlock &MBB,
- const WaitcntBrackets &ScoreBrackets);
+ PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
+ const WaitcntBrackets &Brackets);
+ PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
+ const WaitcntBrackets &ScoreBrackets);
bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
+ bool isDSRead(const MachineInstr &MI) const;
+ bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
bool run(MachineFunction &MF);
void setForceEmitWaitcnt() {
@@ -524,6 +597,9 @@ public:
ForceEmitWaitcnt[SAMPLE_CNT] = false;
ForceEmitWaitcnt[BVH_CNT] = false;
}
+
+ ForceEmitWaitcnt[VA_VDST] = false;
+ ForceEmitWaitcnt[VM_VSRC] = false;
#endif // NDEBUG
}
@@ -531,8 +607,10 @@ public:
// instruction.
WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
switch (Inst.getOpcode()) {
+ // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
case AMDGPU::GLOBAL_INV:
- return VMEM_READ_ACCESS; // tracked using loadcnt
+ return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
+ // VGPRs
case AMDGPU::GLOBAL_WB:
case AMDGPU::GLOBAL_WBINV:
return VMEM_WRITE_ACCESS; // tracked using storecnt
@@ -542,7 +620,7 @@ public:
// Maps VMEM access types to their corresponding WaitEventType.
static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
- VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
+ VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
assert(SIInstrInfo::isVMEM(Inst));
// LDS DMA loads are also stores, but on the LDS side. On the VMEM side
@@ -551,22 +629,41 @@ public:
return VMEM_ACCESS;
if (Inst.mayStore() &&
(!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
- // FLAT and SCRATCH instructions may access scratch. Other VMEM
- // instructions do not.
- if (TII->mayAccessScratchThroughFlat(Inst))
+ if (TII->mayAccessScratch(Inst))
return SCRATCH_WRITE_ACCESS;
return VMEM_WRITE_ACCESS;
}
if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
- return VMEM_READ_ACCESS;
+ return VMEM_ACCESS;
return VmemReadMapping[getVmemType(Inst)];
}
+ std::optional<WaitEventType>
+ getExpertSchedulingEventType(const MachineInstr &Inst) const;
+
+ bool isAsync(const MachineInstr &MI) const {
+ if (!SIInstrInfo::isLDSDMA(MI))
+ return false;
+ if (SIInstrInfo::usesASYNC_CNT(MI))
+ return true;
+ const MachineOperand *Async =
+ TII->getNamedOperand(MI, AMDGPU::OpName::IsAsync);
+ return Async && (Async->getImm());
+ }
+
+ bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
+ return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
+ }
+
+ bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
+ return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
+ }
+
bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr,
- bool FlushVmCnt);
+ PreheaderFlushFlags FlushFlags);
bool generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
@@ -579,6 +676,16 @@ public:
WaitcntBrackets &ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
+ void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ bool ExpertMode) const;
+ AtomicRMWState getAtomicRMWState(MachineInstr &MI,
+ AtomicRMWState PrevState) const;
+ const WaitEventSet &getWaitEvents(InstCounterType T) const {
+ return WCG->getWaitEvents(T);
+ }
+ InstCounterType getCounterFromEvent(WaitEventType E) const {
+ return WCG->getCounterFromEvent(E);
+ }
};
// This objects maintains the current score brackets of each wait counter, and
@@ -591,7 +698,30 @@ public:
// "s_waitcnt 0" before use.
class WaitcntBrackets {
public:
- WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {}
+ WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
+ assert(Context->TRI->getNumRegUnits() < REGUNITS_END);
+ }
+
+#ifndef NDEBUG
+ ~WaitcntBrackets() {
+ unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
+ for (auto &[ID, Val] : VMem) {
+ if (Val.empty())
+ ++NumUnusedVmem;
+ }
+ for (auto &[ID, Val] : SGPRs) {
+ if (Val.empty())
+ ++NumUnusedSGPRs;
+ }
+
+ if (NumUnusedVmem || NumUnusedSGPRs) {
+ errs() << "WaitcntBracket had unused entries at destruction time: "
+ << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
+ << " SGPR unused entries\n";
+ std::abort();
+ }
+ }
+#endif
bool isSmemCounter(InstCounterType T) const {
return T == Context->SmemAccessCounter || T == X_CNT;
@@ -602,6 +732,18 @@ public:
return T == X_CNT ? 1 : 0;
}
+ unsigned getOutstanding(InstCounterType T) const {
+ return ScoreUBs[T] - ScoreLBs[T];
+ }
+
+ bool hasPendingVMEM(VMEMID ID, InstCounterType T) const {
+ return getVMemScore(ID, T) > getScoreLB(T);
+ }
+
+ /// \returns true if we have no score entries for counter \p T.
+ bool empty(InstCounterType T) const { return getScoreRange(T) == 0; }
+
+private:
unsigned getScoreLB(InstCounterType T) const {
assert(T < NUM_INST_CNTS);
return ScoreLBs[T];
@@ -616,53 +758,58 @@ public:
return getScoreUB(T) - getScoreLB(T);
}
- unsigned getRegScore(int GprNo, InstCounterType T) const {
- if (GprNo < NUM_ALL_VGPRS)
- return VgprScores[T][GprNo];
-
- if (GprNo < NUM_ALL_ALLOCATABLE)
- return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
+ unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {
+ auto It = SGPRs.find(RU);
+ return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0;
+ }
- assert(GprNo == SCC);
- return SCCScore;
+ unsigned getVMemScore(VMEMID TID, InstCounterType T) const {
+ auto It = VMem.find(TID);
+ return It != VMem.end() ? It->second.Scores[T] : 0;
}
+public:
bool merge(const WaitcntBrackets &Other);
- RegInterval getRegInterval(const MachineInstr *MI,
- const MachineOperand &Op) const;
-
bool counterOutOfOrder(InstCounterType T) const;
- void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
- void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
-
- void determineWait(InstCounterType T, RegInterval Interval,
- AMDGPU::Waitcnt &Wait) const;
- void determineWait(InstCounterType T, int RegNo,
- AMDGPU::Waitcnt &Wait) const {
- determineWait(T, {RegNo, RegNo + 1}, Wait);
+ void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
+ simplifyWaitcnt(Wait, Wait);
}
+ void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
+ AMDGPU::Waitcnt &UpdateWait) const;
+ void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+ void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
+ AMDGPU::Waitcnt &UpdateWait) const;
+ void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
+ AMDGPU::Waitcnt &UpdateWait) const;
+
+ void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
+ AMDGPU::Waitcnt &Wait) const;
+ void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
+ AMDGPU::Waitcnt &Wait) const;
+ AMDGPU::Waitcnt determineAsyncWait(unsigned N);
void tryClearSCCWriteEvent(MachineInstr *Inst);
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
- void applyXcnt(const AMDGPU::Waitcnt &Wait);
void updateByEvent(WaitEventType E, MachineInstr &MI);
+ void recordAsyncMark(MachineInstr &MI);
- unsigned hasPendingEvent() const { return PendingEvents; }
- unsigned hasPendingEvent(WaitEventType E) const {
- return PendingEvents & (1 << E);
+ bool hasPendingEvent() const { return !PendingEvents.empty(); }
+ bool hasPendingEvent(WaitEventType E) const {
+ return PendingEvents.contains(E);
}
- unsigned hasPendingEvent(InstCounterType T) const {
- unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
- assert((HasPending != 0) == (getScoreRange(T) != 0));
+ bool hasPendingEvent(InstCounterType T) const {
+ bool HasPending = PendingEvents & Context->getWaitEvents(T);
+ assert(HasPending == !empty(T) &&
+ "Expected pending events iff scoreboard is not empty");
return HasPending;
}
bool hasMixedPendingEvents(InstCounterType T) const {
- unsigned Events = hasPendingEvent(T);
+ WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
// Return true if more than one bit is set in Events.
- return Events & (Events - 1);
+ return Events.twoOrMore();
}
bool hasPendingFlat() const {
@@ -683,33 +830,36 @@ public:
unsigned getPendingGDSWait() const {
return std::min(getScoreUB(DS_CNT) - LastGDS,
- Context->getWaitCountMax(DS_CNT) - 1);
+ getWaitCountMax(Context->getLimits(), DS_CNT) - 1);
}
void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
// Return true if there might be pending writes to the vgpr-interval by VMEM
// instructions with types different from V.
- bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- assert(RegNo < NUM_ALL_VGPRS);
- if (VgprVmemTypes[RegNo] & ~(1 << V))
+ bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
+ for (MCRegUnit RU : regunits(Reg)) {
+ auto It = VMem.find(toVMEMID(RU));
+ if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
return true;
}
return false;
}
- void clearVgprVmemTypes(RegInterval Interval) {
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- assert(RegNo < NUM_ALL_VGPRS);
- VgprVmemTypes[RegNo] = 0;
+ void clearVgprVmemTypes(MCPhysReg Reg) {
+ for (MCRegUnit RU : regunits(Reg)) {
+ if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
+ It->second.VMEMTypes = 0;
+ if (It->second.empty())
+ VMem.erase(It);
+ }
}
}
void setStateOnFunctionEntryOrReturn() {
- setScoreUB(STORE_CNT,
- getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
- PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
+ setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) +
+ getWaitCountMax(Context->getLimits(), STORE_CNT));
+ PendingEvents |= Context->getWaitEvents(STORE_CNT);
}
ArrayRef<const MachineInstr *> getLDSDMAStores() const {
@@ -718,11 +868,15 @@ public:
bool hasPointSampleAccel(const MachineInstr &MI) const;
bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
- RegInterval Interval) const;
+ MCPhysReg RU) const;
void print(raw_ostream &) const;
void dump() const { print(dbgs()); }
+ // Free up memory by removing empty entries from the DenseMaps that track
+ // event scores.
+ void purgeEmptyTrackingData();
+
private:
struct MergeInfo {
unsigned OldLB;
@@ -730,8 +884,27 @@ private:
unsigned MyShift;
unsigned OtherShift;
};
+
+ using CounterValueArray = std::array<unsigned, NUM_INST_CNTS>;
+
+ void determineWaitForScore(InstCounterType T, unsigned Score,
+ AMDGPU::Waitcnt &Wait) const;
+
static bool mergeScore(const MergeInfo &M, unsigned &Score,
unsigned OtherScore);
+ bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
+ ArrayRef<CounterValueArray> OtherMarks);
+
+ iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const {
+ assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
+ if (!Context->TRI->isInAllocatableClass(Reg))
+ return {{}, {}};
+ const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg);
+ unsigned Size = Context->TRI->getRegSizeInBits(*RC);
+ if (Size == 16 && Context->ST->hasD16Writes32BitVgpr())
+ Reg = Context->TRI->get32BitRegister(Reg);
+ return Context->TRI->regunits(Reg);
+ }
void setScoreLB(InstCounterType T, unsigned Val) {
assert(T < NUM_INST_CNTS);
@@ -745,49 +918,95 @@ private:
if (T != EXP_CNT)
return;
- if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
- ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
+ if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT))
+ ScoreLBs[EXP_CNT] =
+ ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT);
+ }
+
+ void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
+ const SIRegisterInfo *TRI = Context->TRI;
+ if (Reg == AMDGPU::SCC) {
+ SCCScore = Val;
+ } else if (TRI->isVectorRegister(*Context->MRI, Reg)) {
+ for (MCRegUnit RU : regunits(Reg))
+ VMem[toVMEMID(RU)].Scores[T] = Val;
+ } else if (TRI->isSGPRReg(*Context->MRI, Reg)) {
+ auto STy = getSgprScoresIdx(T);
+ for (MCRegUnit RU : regunits(Reg))
+ SGPRs[RU].Scores[STy] = Val;
+ } else {
+ llvm_unreachable("Register cannot be tracked/unknown register!");
+ }
}
- void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
- setScoreByInterval({GprNo, GprNo + 1}, T, Val);
+ void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
+ VMem[TID].Scores[T] = Val;
}
- void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
- unsigned Score);
-
- void setScoreByOperand(const MachineInstr *MI, const MachineOperand &Op,
- InstCounterType CntTy, unsigned Val);
+ void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,
+ unsigned Val);
const SIInsertWaitcnts *Context;
unsigned ScoreLBs[NUM_INST_CNTS] = {0};
unsigned ScoreUBs[NUM_INST_CNTS] = {0};
- unsigned PendingEvents = 0;
+ WaitEventSet PendingEvents;
// Remember the last flat memory operation.
unsigned LastFlat[NUM_INST_CNTS] = {0};
// Remember the last GDS operation.
unsigned LastGDS = 0;
- // wait_cnt scores for every vgpr.
- // Keep track of the VgprUB and SgprUB to make merge at join efficient.
- int VgprUB = -1;
- int SgprUB = -1;
- unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
- // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
- // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
- // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
- // X_CNT score.
- unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
+
+ // The score tracking logic is fragmented as follows:
+ // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
+ // - SGPRs: SGPR RegUnits
+ // - SCC: non-allocatable and not general-purpose, so not tracked as an SGPR.
+ //
+ // For the VMem case, if the key is within the range of LDS DMA IDs,
+ // then the corresponding index into the `LDSDMAStores` vector below is:
+ // Key - LDSDMA_BEGIN - 1
+ // This is because LDSDMA_BEGIN is a generic entry and does not have an
+ // associated MachineInstr.
+ //
+ // TODO: Could we track SCC alongside SGPRs so it's no longer a special case?
+
+ struct VMEMInfo {
+ // Scores for all instruction counters. Zero-initialized.
+ CounterValueArray Scores{};
+ // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
+ unsigned VMEMTypes = 0;
+
+ bool empty() const {
+ return !VMEMTypes && all_of(Scores, [](unsigned S) { return S == 0; });
+ }
+ };
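
To make the LDSDMAStores indexing described in the comment above concrete, here is a standalone sketch of the key-to-index mapping; the numeric value of LDSDMA_BEGIN is hypothetical and stands in for the real VMEMID encoding:

    #include <cassert>

    constexpr unsigned LDSDMA_BEGIN = 1024; // hypothetical value, illustration only

    // LDSDMA_BEGIN itself is the generic entry with no associated MachineInstr;
    // every key above it maps to LDSDMAStores[Key - LDSDMA_BEGIN - 1].
    static int ldsDmaStoreIndex(unsigned Key) {
      if (Key == LDSDMA_BEGIN)
        return -1; // generic entry
      return static_cast<int>(Key - LDSDMA_BEGIN - 1);
    }

    int main() {
      assert(ldsDmaStoreIndex(LDSDMA_BEGIN) == -1);
      assert(ldsDmaStoreIndex(LDSDMA_BEGIN + 1) == 0); // first tracked store
      assert(ldsDmaStoreIndex(LDSDMA_BEGIN + 3) == 2); // LDSDMAStores[2]
    }
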
+
+ struct SGPRInfo {
+ // Wait cnt scores for this sgpr. Only the DS_CNT (corresponding to LGKMcnt
+ // pre-gfx12) or KM_CNT (gfx12+ only) counter and the X_CNT (gfx1250)
+ // counter are relevant. Element 0 holds the score for either DS_CNT or
+ // KM_CNT and element 1 holds the X_CNT score.
+ std::array<unsigned, 2> Scores = {0};
+
+ bool empty() const { return !Scores[0] && !Scores[1]; }
+ };
+
+ DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
+ DenseMap<MCRegUnit, SGPRInfo> SGPRs;
+
// Reg score for SCC.
unsigned SCCScore = 0;
// The unique instruction that has an SCC write pending, if there is one.
const MachineInstr *PendingSCCWrite = nullptr;
- // Bitmask of the VmemTypes of VMEM instructions that might have a pending
- // write to each vgpr.
- unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
+
// Store representative LDS DMA operations. The only useful info here is
// alias info. One store is kept per unique AAInfo.
- SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
+ SmallVector<const MachineInstr *> LDSDMAStores;
+
+ // State of all counters at each async mark encountered so far.
+ SmallVector<CounterValueArray> AsyncMarks;
+ static constexpr unsigned MaxAsyncMarks = 16;
+
+ // Track the upper bound score for async operations that are not part of a
+ // mark yet. Initialized to all zeros.
+ CounterValueArray AsyncScore{};
};
class SIInsertWaitcntsLegacy : public MachineFunctionPass {
@@ -813,82 +1032,9 @@ public:
} // end anonymous namespace
-RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
- const MachineOperand &Op) const {
- if (Op.getReg() == AMDGPU::SCC)
- return {SCC, SCC + 1};
-
- const SIRegisterInfo *TRI = Context->TRI;
- const MachineRegisterInfo *MRI = Context->MRI;
-
- if (!TRI->isInAllocatableClass(Op.getReg()))
- return {-1, -1};
-
- // A use via a PW operand does not need a waitcnt.
- // A partial write is not a WAW.
- assert(!Op.getSubReg() || !Op.isUndef());
-
- RegInterval Result;
-
- MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST);
- unsigned RegIdx = TRI->getHWRegIndex(MCReg);
-
- const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
- unsigned Size = TRI->getRegSizeInBits(*RC);
-
- // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits
- if (TRI->isVectorRegister(*MRI, Op.getReg())) {
- unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0);
- assert(!Context->ST->hasMAIInsts() || Reg < AGPR_OFFSET);
- Result.first = Reg;
- if (TRI->isAGPR(*MRI, Op.getReg()))
- Result.first += AGPR_OFFSET;
- assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
- assert(Size % 16 == 0);
- Result.second = Result.first + (Size / 16);
-
- if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) {
- // Regardless of which lo16/hi16 is used, consider the full 32-bit
- // register used.
- if (AMDGPU::isHi16Reg(MCReg, *TRI))
- Result.first -= 1;
- else
- Result.second += 1;
- }
- } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
- // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar
- // sources like SRC_PRIVATE_BASE.
- Result.first = RegIdx + NUM_ALL_VGPRS;
- Result.second = Result.first + divideCeil(Size, 32);
- } else {
- return {-1, -1};
- }
-
- return Result;
-}
-
-void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
- InstCounterType CntTy,
- unsigned Score) {
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- if (RegNo < NUM_ALL_VGPRS) {
- VgprUB = std::max(VgprUB, RegNo);
- VgprScores[CntTy][RegNo] = Score;
- } else if (RegNo < NUM_ALL_ALLOCATABLE) {
- SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
- SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
- } else {
- assert(RegNo == SCC);
- SCCScore = Score;
- }
- }
-}
-
-void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
- const MachineOperand &Op,
+void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
InstCounterType CntTy, unsigned Score) {
- RegInterval Interval = getRegInterval(MI, Op);
- setScoreByInterval(Interval, CntTy, Score);
+ setRegScore(Op.getReg().asMCReg(), CntTy, Score);
}
// Return true if the subtarget is one that enables Point Sample Acceleration
@@ -911,16 +1057,17 @@ bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
// (this is the type that a point sample accelerated instruction effectively
// becomes)
-bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
- const MachineInstr &MI, RegInterval Interval) const {
+bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
+ MCPhysReg Reg) const {
if (!hasPointSampleAccel(MI))
return false;
- return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER);
+ return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
}
void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
- InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
+ InstCounterType T = Context->getCounterFromEvent(E);
+ assert(T < Context->MaxCounter);
unsigned UB = getScoreUB(T);
unsigned CurrScore = UB + 1;
@@ -929,7 +1076,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
// PendingEvents and ScoreUB need to be updated regardless of whether this
// event changes the score of a register or not.
// Examples include vm_cnt for buffer-store or lgkm_cnt for send-message.
- PendingEvents |= 1 << E;
+ PendingEvents.insert(E);
setScoreUB(T, CurrScore);
const SIRegisterInfo *TRI = Context->TRI;
@@ -943,57 +1090,52 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
// All GDS operations must protect their address register (same as
// export.)
if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
- setScoreByOperand(&Inst, *AddrOp, EXP_CNT, CurrScore);
+ setScoreByOperand(*AddrOp, EXP_CNT, CurrScore);
if (Inst.mayStore()) {
if (const auto *Data0 =
TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
- setScoreByOperand(&Inst, *Data0, EXP_CNT, CurrScore);
+ setScoreByOperand(*Data0, EXP_CNT, CurrScore);
if (const auto *Data1 =
TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
- setScoreByOperand(&Inst, *Data1, EXP_CNT, CurrScore);
+ setScoreByOperand(*Data1, EXP_CNT, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
Inst.getOpcode() != AMDGPU::DS_APPEND &&
Inst.getOpcode() != AMDGPU::DS_CONSUME &&
Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
for (const MachineOperand &Op : Inst.all_uses()) {
if (TRI->isVectorRegister(*MRI, Op.getReg()))
- setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore);
+ setScoreByOperand(Op, EXP_CNT, CurrScore);
}
}
} else if (TII->isFLAT(Inst)) {
if (Inst.mayStore()) {
- setScoreByOperand(&Inst,
- *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
+ setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
EXP_CNT, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst)) {
- setScoreByOperand(&Inst,
- *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
+ setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
EXP_CNT, CurrScore);
}
} else if (TII->isMIMG(Inst)) {
if (Inst.mayStore()) {
- setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);
+ setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst)) {
- setScoreByOperand(&Inst,
- *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
+ setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
EXP_CNT, CurrScore);
}
} else if (TII->isMTBUF(Inst)) {
if (Inst.mayStore())
- setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);
+ setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
} else if (TII->isMUBUF(Inst)) {
if (Inst.mayStore()) {
- setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);
+ setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst)) {
- setScoreByOperand(&Inst,
- *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
+ setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
EXP_CNT, CurrScore);
}
} else if (TII->isLDSDIR(Inst)) {
// LDSDIR instructions attach the score to the destination.
- setScoreByOperand(&Inst,
- *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
+ setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
EXP_CNT, CurrScore);
} else {
if (TII->isEXP(Inst)) {
@@ -1003,27 +1145,37 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
// score.
for (MachineOperand &DefMO : Inst.all_defs()) {
if (TRI->isVGPR(*MRI, DefMO.getReg())) {
- setScoreByOperand(&Inst, DefMO, EXP_CNT, CurrScore);
+ setScoreByOperand(DefMO, EXP_CNT, CurrScore);
}
}
}
for (const MachineOperand &Op : Inst.all_uses()) {
if (TRI->isVectorRegister(*MRI, Op.getReg()))
- setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore);
+ setScoreByOperand(Op, EXP_CNT, CurrScore);
}
}
} else if (T == X_CNT) {
WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
- if (PendingEvents & (1 << OtherEvent)) {
+ if (PendingEvents.contains(OtherEvent)) {
// Hardware inserts an implicit xcnt between interleaved
// SMEM and VMEM operations. So there will never be
// outstanding address translations for both SMEM and
// VMEM at the same time.
setScoreLB(T, getScoreUB(T) - 1);
- PendingEvents &= ~(1 << OtherEvent);
+ PendingEvents.remove(OtherEvent);
}
for (const MachineOperand &Op : Inst.all_uses())
- setScoreByOperand(&Inst, Op, T, CurrScore);
+ setScoreByOperand(Op, T, CurrScore);
+ } else if (T == VA_VDST || T == VM_VSRC) {
+ // Match the score to the VGPR destination or source registers as
+ // appropriate.
+ for (const MachineOperand &Op : Inst.operands()) {
+ if (!Op.isReg() || (T == VA_VDST && Op.isUse()) ||
+ (T == VM_VSRC && Op.isDef()))
+ continue;
+ if (TRI->isVectorRegister(*Context->MRI, Op.getReg()))
+ setScoreByOperand(Op, T, CurrScore);
+ }
} else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
// Match the score to the destination registers.
//
@@ -1035,9 +1187,8 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
// Special cases where implicit register defs exists, such as M0 or VCC,
// but none with memory instructions.
for (const MachineOperand &Op : Inst.defs()) {
- RegInterval Interval = getRegInterval(&Inst, Op);
if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
- if (Interval.first >= NUM_ALL_VGPRS)
+ if (!TRI->isVectorRegister(*MRI, Op.getReg())) // TODO: add wrapper
continue;
if (updateVMCntOnly(Inst)) {
// updateVMCntOnly should only leave us with VGPRs
@@ -1050,16 +1201,20 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
// this with another potential dependency
if (hasPointSampleAccel(Inst))
TypesMask |= 1 << VMEM_NOSAMPLER;
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
- VgprVmemTypes[RegNo] |= TypesMask;
+ for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
+ VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
}
}
- setScoreByInterval(Interval, T, CurrScore);
+ setScoreByOperand(Op, T, CurrScore);
}
if (Inst.mayStore() &&
- (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
+ (TII->isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) {
// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
// written can be accessed. A load from LDS to VMEM does not need a wait.
+ //
+ // "Slot" is the offset from LDSDMA_BEGIN. If it is non-zero, then
+ // LDSDMAStores[Slot - 1] is the MachineInstr used to track this LDS DMA
+ // store.
unsigned Slot = 0;
for (const auto *MemOp : Inst.memoperands()) {
if (!MemOp->isStore() ||
@@ -1072,9 +1227,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
// original memory object and practically produced in the module LDS
// lowering pass. If there is no scope available we will not be able
// to disambiguate LDS aliasing as after the module lowering all LDS
- // is squashed into a single big object. Do not attempt to use one of
- // the limited LDSDMAStores for something we will not be able to use
- // anyway.
+ // is squashed into a single big object.
if (!AAI || !AAI.Scope)
break;
for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
@@ -1085,61 +1238,93 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
}
}
}
- if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1)
+ if (Slot)
break;
+ // The slot may not be valid because it can be >= NUM_LDSDMA, which
+ // means the scoreboard cannot track it. We still want to preserve the
+ // MI in order to check alias information, though.
LDSDMAStores.push_back(&Inst);
Slot = LDSDMAStores.size();
break;
}
- setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore);
- if (Slot)
- setRegScore(FIRST_LDS_VGPR, T, CurrScore);
+ setVMemScore(LDSDMA_BEGIN, T, CurrScore);
+ if (Slot && Slot < NUM_LDSDMA)
+ setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
+ }
+
+ // FIXME: Not supported on GFX12 yet. Newer async operations use other
+ // counters too, so this will need a map from instruction or event types to
+ // counter types.
+ if (Context->isAsyncLdsDmaWrite(Inst) && T == LOAD_CNT) {
+ assert(!SIInstrInfo::usesASYNC_CNT(Inst));
+ AsyncScore[T] = CurrScore;
}
if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
- setRegScore(SCC, T, CurrScore);
+ setRegScore(AMDGPU::SCC, T, CurrScore);
PendingSCCWrite = &Inst;
}
}
}
+void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
+ // In the absence of loops, AsyncMarks can grow linearly with the program
+ // until we encounter an ASYNCMARK_WAIT. We could drop the oldest mark above a
+ // limit every time we push a new mark, but that seems like unnecessary work
+ // in practical cases. We do separately truncate the array when processing a
+ // loop, which should be sufficient.
+ AsyncMarks.push_back(AsyncScore);
+ AsyncScore = {};
+ LLVM_DEBUG({
+ dbgs() << "recordAsyncMark:\n" << Inst;
+ for (const auto &Mark : AsyncMarks) {
+ llvm::interleaveComma(Mark, dbgs());
+ dbgs() << '\n';
+ }
+ });
+}
+
void WaitcntBrackets::print(raw_ostream &OS) const {
const GCNSubtarget *ST = Context->ST;
- OS << '\n';
for (auto T : inst_counter_types(Context->MaxCounter)) {
unsigned SR = getScoreRange(T);
-
switch (T) {
case LOAD_CNT:
OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
- << SR << "): ";
+ << SR << "):";
break;
case DS_CNT:
OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
- << SR << "): ";
+ << SR << "):";
break;
case EXP_CNT:
- OS << " EXP_CNT(" << SR << "): ";
+ OS << " EXP_CNT(" << SR << "):";
break;
case STORE_CNT:
OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
- << SR << "): ";
+ << SR << "):";
break;
case SAMPLE_CNT:
- OS << " SAMPLE_CNT(" << SR << "): ";
+ OS << " SAMPLE_CNT(" << SR << "):";
break;
case BVH_CNT:
- OS << " BVH_CNT(" << SR << "): ";
+ OS << " BVH_CNT(" << SR << "):";
break;
case KM_CNT:
- OS << " KM_CNT(" << SR << "): ";
+ OS << " KM_CNT(" << SR << "):";
break;
case X_CNT:
- OS << " X_CNT(" << SR << "): ";
+ OS << " X_CNT(" << SR << "):";
+ break;
+ case VA_VDST:
+ OS << " VA_VDST(" << SR << "): ";
+ break;
+ case VM_VSRC:
+ OS << " VM_VSRC(" << SR << "): ";
break;
default:
- OS << " UNKNOWN(" << SR << "): ";
+ OS << " UNKNOWN(" << SR << "):";
break;
}
@@ -1147,29 +1332,38 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
// Print vgpr scores.
unsigned LB = getScoreLB(T);
- for (int J = 0; J <= VgprUB; J++) {
- unsigned RegScore = getRegScore(J, T);
+ SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
+ sort(SortedVMEMIDs);
+
+ for (auto ID : SortedVMEMIDs) {
+ unsigned RegScore = VMem.at(ID).Scores[T];
if (RegScore <= LB)
continue;
unsigned RelScore = RegScore - LB - 1;
- if (J < FIRST_LDS_VGPR) {
- OS << RelScore << ":v" << J << " ";
+ if (ID < REGUNITS_END) {
+ OS << ' ' << RelScore << ":vRU" << ID;
} else {
- OS << RelScore << ":ds ";
+ assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
+ "Unhandled/unexpected ID value!");
+ OS << ' ' << RelScore << ":LDSDMA" << ID;
}
}
+
// Also need to print sgpr scores for lgkm_cnt or xcnt.
if (isSmemCounter(T)) {
- for (int J = 0; J <= SgprUB; J++) {
- unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
+ SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
+ sort(SortedSMEMIDs);
+ for (auto ID : SortedSMEMIDs) {
+ unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)];
if (RegScore <= LB)
continue;
unsigned RelScore = RegScore - LB - 1;
- OS << RelScore << ":s" << J << " ";
+ OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
}
}
+
if (T == KM_CNT && SCCScore > 0)
- OS << SCCScore << ":scc ";
+ OS << ' ' << SCCScore << ":scc";
}
OS << '\n';
}
@@ -1187,20 +1381,70 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
}
OS << '\n';
+ OS << "Async score: ";
+ if (AsyncScore.empty())
+ OS << "none";
+ else
+ llvm::interleaveComma(AsyncScore, OS);
+ OS << '\n';
+
+ OS << "Async marks: " << AsyncMarks.size() << '\n';
+
+ for (const auto &Mark : AsyncMarks) {
+ for (auto T : inst_counter_types()) {
+ unsigned MarkedScore = Mark[T];
+ switch (T) {
+ case LOAD_CNT:
+ OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM")
+ << "_CNT: " << MarkedScore;
+ break;
+ case DS_CNT:
+ OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM")
+ << "_CNT: " << MarkedScore;
+ break;
+ case EXP_CNT:
+ OS << " EXP_CNT: " << MarkedScore;
+ break;
+ case STORE_CNT:
+ OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS")
+ << "_CNT: " << MarkedScore;
+ break;
+ case SAMPLE_CNT:
+ OS << " SAMPLE_CNT: " << MarkedScore;
+ break;
+ case BVH_CNT:
+ OS << " BVH_CNT: " << MarkedScore;
+ break;
+ case KM_CNT:
+ OS << " KM_CNT: " << MarkedScore;
+ break;
+ case X_CNT:
+ OS << " X_CNT: " << MarkedScore;
+ break;
+ default:
+ OS << " UNKNOWN: " << MarkedScore;
+ break;
+ }
+ }
+ OS << '\n';
+ }
OS << '\n';
}
-/// Simplify the waitcnt, in the sense of removing redundant counts, and return
-/// whether a waitcnt instruction is needed at all.
-void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
- simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
- simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
- simplifyWaitcnt(DS_CNT, Wait.DsCnt);
- simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
- simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
- simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
- simplifyWaitcnt(KM_CNT, Wait.KmCnt);
- simplifyWaitcnt(X_CNT, Wait.XCnt);
+/// Simplify \p UpdateWait by removing waits that are redundant based on the
+/// current WaitcntBrackets and any other waits specified in \p CheckWait.
+void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
+ AMDGPU::Waitcnt &UpdateWait) const {
+ simplifyWaitcnt(LOAD_CNT, UpdateWait.LoadCnt);
+ simplifyWaitcnt(EXP_CNT, UpdateWait.ExpCnt);
+ simplifyWaitcnt(DS_CNT, UpdateWait.DsCnt);
+ simplifyWaitcnt(STORE_CNT, UpdateWait.StoreCnt);
+ simplifyWaitcnt(SAMPLE_CNT, UpdateWait.SampleCnt);
+ simplifyWaitcnt(BVH_CNT, UpdateWait.BvhCnt);
+ simplifyWaitcnt(KM_CNT, UpdateWait.KmCnt);
+ simplifyXcnt(CheckWait, UpdateWait);
+ simplifyWaitcnt(VA_VDST, UpdateWait.VaVdst);
+ simplifyVmVsrc(CheckWait, UpdateWait);
}
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -1212,52 +1456,155 @@ void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
Count = ~0u;
}
-void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
- AMDGPU::Waitcnt &Wait) const {
+void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
+ AMDGPU::Waitcnt &UpdateWait) const {
+ // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
+ // optimizations. On entry to a block with multiple predecessors, there may
+ // be pending SMEM and VMEM events active at the same time.
+ // In such cases, only clear one active event at a time.
+ // TODO: Revisit xcnt optimizations for gfx1250.
+ // Wait on XCNT is redundant if we are already waiting for a load to complete.
+ // SMEM can return out of order, so only omit XCNT wait if we are waiting till
+ // zero.
+ if (CheckWait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
+ UpdateWait.XCnt = ~0u;
+ // If we have a pending store we cannot optimize XCnt because we do not wait
+ // for stores. VMEM loads return in order, so if we only have loads XCnt is
+ // decremented to the same number as LOADCnt.
+ if (CheckWait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
+ !hasPendingEvent(STORE_CNT) && CheckWait.XCnt >= CheckWait.LoadCnt)
+ UpdateWait.XCnt = ~0u;
+ simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
+}
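
As a worked example of the second rule: with three VMEM loads and no stores in flight, XCNT is decremented in lockstep with LOADCNT, so a wait for loadcnt <= 1 already implies xcnt <= 1. A standalone model of the two elision rules (the pending-event flags are plain inputs here, not the pass's real state):

    #include <cassert>

    constexpr unsigned NoWait = ~0u;

    // Model of the two XCNT elision rules described above.
    static unsigned simplifiedXcnt(unsigned XCnt, unsigned KmCnt, unsigned LoadCnt,
                                   bool PendingSmemGroup, bool PendingVmemGroup,
                                   bool PendingStore) {
      // Rule 1: a kmcnt(0) wait already drains SMEM address translations.
      if (KmCnt == 0 && PendingSmemGroup)
        return NoWait;
      // Rule 2: with only in-order VMEM loads pending, XCNT tracks LOADCNT.
      if (LoadCnt != NoWait && PendingVmemGroup && !PendingStore &&
          XCnt >= LoadCnt)
        return NoWait;
      return XCnt;
    }

    int main() {
      assert(simplifiedXcnt(2, 0, NoWait, true, false, false) == NoWait);
      assert(simplifiedXcnt(1, NoWait, 1, false, true, false) == NoWait);
      assert(simplifiedXcnt(1, NoWait, 1, false, true, true) == 1); // stores block it
    }
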
+
+void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
+ AMDGPU::Waitcnt &UpdateWait) const {
+ // Waiting for some counters implies waiting for VM_VSRC, since an
+ // instruction that decrements a counter on completion would have
+ // decremented VM_VSRC once its VGPR operands had been read.
+ if (CheckWait.VmVsrc >=
+ std::min({CheckWait.LoadCnt, CheckWait.StoreCnt, CheckWait.SampleCnt,
+ CheckWait.BvhCnt, CheckWait.DsCnt}))
+ UpdateWait.VmVsrc = ~0u;
+ simplifyWaitcnt(VM_VSRC, UpdateWait.VmVsrc);
+}
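
Concretely: an instruction cannot decrement loadcnt (or any of the other counters named here) until it has finished reading its VGPR sources, which is the point at which it decrements VM_VSRC. Any VM_VSRC bound at least as large as the tightest of those counter waits is therefore already implied. A minimal model:

    #include <algorithm>
    #include <cassert>

    constexpr unsigned NoWait = ~0u;

    // A VM_VSRC wait is implied when some memory counter is waited to a
    // value at most as large: completion implies the sources were read.
    static unsigned simplifiedVmVsrc(unsigned VmVsrc, unsigned LoadCnt,
                                     unsigned StoreCnt, unsigned SampleCnt,
                                     unsigned BvhCnt, unsigned DsCnt) {
      unsigned Tightest = std::min({LoadCnt, StoreCnt, SampleCnt, BvhCnt, DsCnt});
      return VmVsrc >= Tightest ? NoWait : VmVsrc;
    }

    int main() {
      // Waiting loadcnt<=1 implies vm_vsrc<=1, so vm_vsrc(2) is redundant.
      assert(simplifiedVmVsrc(2, 1, NoWait, NoWait, NoWait, NoWait) == NoWait);
      // A stricter vm_vsrc bound than any counter wait must be kept.
      assert(simplifiedVmVsrc(0, 1, NoWait, NoWait, NoWait, NoWait) == 0);
    }
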
+
+void WaitcntBrackets::purgeEmptyTrackingData() {
+ for (auto &[K, V] : make_early_inc_range(VMem)) {
+ if (V.empty())
+ VMem.erase(K);
+ }
+ for (auto &[K, V] : make_early_inc_range(SGPRs)) {
+ if (V.empty())
+ SGPRs.erase(K);
+ }
+}
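
The make_early_inc_range wrapper exists because erasing the current DenseMap entry invalidates the iterator; the wrapper advances before the body runs. With a standard container the equivalent cleanup is a one-liner, shown here as a sketch:

    #include <cassert>
    #include <unordered_map>

    struct Info {
      unsigned Score = 0;
      bool empty() const { return Score == 0; }
    };

    int main() {
      std::unordered_map<unsigned, Info> Track{{1, {0}}, {2, {7}}, {3, {0}}};
      // C++20 std::erase_if handles the advance-before-erase dance internally.
      std::erase_if(Track, [](const auto &KV) { return KV.second.empty(); });
      assert(Track.size() == 1 && Track.count(2) == 1);
    }
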
+
+void WaitcntBrackets::determineWaitForScore(InstCounterType T,
+ unsigned ScoreToWait,
+ AMDGPU::Waitcnt &Wait) const {
const unsigned LB = getScoreLB(T);
const unsigned UB = getScoreUB(T);
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- unsigned ScoreToWait = getRegScore(RegNo, T);
-
- // If the score of src_operand falls within the bracket, we need an
- // s_waitcnt instruction.
- if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
- if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
- !Context->ST->hasFlatLgkmVMemCountInOrder()) {
- // If there is a pending FLAT operation, and this is a VMem or LGKM
- // waitcnt and the target can report early completion, then we need
- // to force a waitcnt 0.
- addWait(Wait, T, 0);
- } else if (counterOutOfOrder(T)) {
- // Counter can get decremented out-of-order when there
- // are multiple types event in the bracket. Also emit an s_wait counter
- // with a conservative value of 0 for the counter.
- addWait(Wait, T, 0);
- } else {
- // If a counter has been maxed out avoid overflow by waiting for
- // MAX(CounterType) - 1 instead.
- unsigned NeededWait =
- std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
- addWait(Wait, T, NeededWait);
- }
+
+ // If the score falls within the bracket, we need a waitcnt.
+ if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
+ if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
+ !Context->ST->hasFlatLgkmVMemCountInOrder()) {
+ // If there is a pending FLAT operation, and this is a VMem or LGKM
+ // waitcnt and the target can report early completion, then we need
+ // to force a waitcnt 0.
+ addWait(Wait, T, 0);
+ } else if (counterOutOfOrder(T)) {
+ // The counter can get decremented out-of-order when there are multiple
+ // event types in the bracket, so emit an s_wait counter with a
+ // conservative value of 0.
+ addWait(Wait, T, 0);
+ } else {
+ // If a counter has been maxed out avoid overflow by waiting for
+ // MAX(CounterType) - 1 instead.
+ unsigned NeededWait = std::min(
+ UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
+ addWait(Wait, T, NeededWait);
}
}
}
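
For the in-order case the arithmetic reads as "how many newer operations may remain outstanding": with UB = 12 and ScoreToWait = 9, the wait value 3 lets the three operations issued after the tracked one stay in flight. The min() clamp keeps a saturated score from producing an unencodable count. A standalone sketch:

    #include <algorithm>
    #include <cassert>

    // In-order wait computation: wait until at most (UB - ScoreToWait)
    // operations remain, clamped to the counter's encodable maximum - 1.
    static unsigned neededWait(unsigned UB, unsigned ScoreToWait,
                               unsigned WaitCountMax) {
      return std::min(UB - ScoreToWait, WaitCountMax - 1);
    }

    int main() {
      assert(neededWait(12, 9, 64) == 3);   // 3 newer ops may stay outstanding
      assert(neededWait(200, 1, 64) == 63); // clamped to avoid overflow
    }
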
+AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
+ LLVM_DEBUG({
+ dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size()
+ << ":\n";
+ for (const auto &Mark : AsyncMarks) {
+ llvm::interleaveComma(Mark, dbgs());
+ dbgs() << '\n';
+ }
+ });
+
+ AMDGPU::Waitcnt Wait;
+ if (AsyncMarks.size() == MaxAsyncMarks) {
+ // Enforcing MaxAsyncMarks on every push would be unnecessary work because
+ // AsyncMarks grows only linearly when traversing straight-line code. But we
+ // do need to check whether truncation may have occurred at a merge, and
+ // adjust N to ensure that a wait is generated.
+ LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n");
+ N = std::min(N, (unsigned)MaxAsyncMarks - 1);
+ }
+
+ if (AsyncMarks.size() <= N) {
+ LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
+ return Wait;
+ }
+
+ size_t MarkIndex = AsyncMarks.size() - N - 1;
+ const auto &RequiredMark = AsyncMarks[MarkIndex];
+ for (InstCounterType T : inst_counter_types())
+ determineWaitForScore(T, RequiredMark[T], Wait);
+
+ // Immediately remove the waited mark and all older ones. This happens
+ // BEFORE the wait is actually inserted, which is fine because we have
+ // already extracted the wait requirements.
+ LLVM_DEBUG({
+ dbgs() << "Removing " << (MarkIndex + 1)
+ << " async marks after determining wait\n";
+ });
+ AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
+
+ LLVM_DEBUG(dbgs() << "Waits to add: " << Wait);
+ return Wait;
+}
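
A worked example of the mark arithmetic: with five recorded marks and N = 2 (the two newest async groups may stay in flight), MarkIndex = 5 - 2 - 1 = 2, the wait is derived from marks[2], and marks[0..2] are erased. A sketch with plain integers standing in for the per-counter score arrays:

    #include <cassert>
    #include <vector>

    // Wait for everything up to the mark that excludes the N newest groups,
    // then drop that mark and all older ones.
    static int popAsyncMarks(std::vector<int> &Marks, unsigned N) {
      if (Marks.size() <= N)
        return -1; // nothing to wait for
      size_t MarkIndex = Marks.size() - N - 1;
      int Required = Marks[MarkIndex];
      Marks.erase(Marks.begin(), Marks.begin() + MarkIndex + 1);
      return Required;
    }

    int main() {
      std::vector<int> Marks{10, 20, 30, 40, 50};
      assert(popAsyncMarks(Marks, 2) == 30);       // waited on marks[2]
      assert((Marks == std::vector<int>{40, 50})); // newest two remain
      assert(popAsyncMarks(Marks, 2) == -1);       // too few marks left
    }
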
+
+void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
+ AMDGPU::Waitcnt &Wait) const {
+ if (Reg == AMDGPU::SCC) {
+ determineWaitForScore(T, SCCScore, Wait);
+ } else {
+ bool IsVGPR = Context->TRI->isVectorRegister(*Context->MRI, Reg);
+ for (MCRegUnit RU : regunits(Reg))
+ determineWaitForScore(
+ T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
+ Wait);
+ }
+}
+
+void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
+ AMDGPU::Waitcnt &Wait) const {
+ assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
+ determineWaitForScore(T, getVMemScore(TID, T), Wait);
+}
+
void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
// S_BARRIER_WAIT on the same barrier guarantees that the pending write to
// SCC has landed
if (PendingSCCWrite &&
PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
- unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
+ WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
// If this SCC_WRITE is the only pending KM_CNT event, clear counter.
- if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
+ if ((PendingEvents & Context->getWaitEvents(KM_CNT)) ==
SCC_WRITE_PendingEvent) {
setScoreLB(KM_CNT, getScoreUB(KM_CNT));
}
- PendingEvents &= ~SCC_WRITE_PendingEvent;
+ PendingEvents.remove(SCC_WRITE_PendingEvent);
PendingSCCWrite = nullptr;
}
}
@@ -1270,7 +1617,9 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
applyWaitcnt(BVH_CNT, Wait.BvhCnt);
applyWaitcnt(KM_CNT, Wait.KmCnt);
- applyXcnt(Wait);
+ applyWaitcnt(X_CNT, Wait.XCnt);
+ applyWaitcnt(VA_VDST, Wait.VaVdst);
+ applyWaitcnt(VM_VSRC, Wait.VmVsrc);
}
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -1283,25 +1632,22 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
setScoreLB(T, std::max(getScoreLB(T), UB - Count));
} else {
setScoreLB(T, UB);
- PendingEvents &= ~Context->WaitEventMaskForInst[T];
+ PendingEvents.remove(Context->getWaitEvents(T));
}
-}
-
-void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
- // Wait on XCNT is redundant if we are already waiting for a load to complete.
- // SMEM can return out of order, so only omit XCNT wait if we are waiting till
- // zero.
- if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
- return applyWaitcnt(X_CNT, 0);
-
- // If we have pending store we cannot optimize XCnt because we do not wait for
- // stores. VMEM loads retun in order, so if we only have loads XCnt is
- // decremented to the same number as LOADCnt.
- if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
- !hasPendingEvent(STORE_CNT))
- return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
- applyWaitcnt(X_CNT, Wait.XCnt);
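+ // Mirror the XCNT elision rules from simplifyXcnt: a kmcnt(0) wait also
+ // drains pending SMEM address translations, and a loadcnt wait bounds XCNT
+ // when only VMEM loads (no stores) are in flight.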
+ if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
+ if (!hasMixedPendingEvents(X_CNT))
+ applyWaitcnt(X_CNT, 0);
+ else
+ PendingEvents.remove(SMEM_GROUP);
+ }
+ if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
+ !hasPendingEvent(STORE_CNT)) {
+ if (!hasMixedPendingEvents(X_CNT))
+ applyWaitcnt(X_CNT, Count);
+ else if (Count == 0)
+ PendingEvents.remove(VMEM_GROUP);
+ }
}
// Where there are multiple types of event in the bracket of a counter,
@@ -1311,6 +1657,20 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
(T == X_CNT && hasPendingEvent(SMEM_GROUP)))
return true;
+
+ // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
+ // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
+ // out-of-order completion.
+ if (T == LOAD_CNT) {
+ WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
+ // Remove GLOBAL_INV_ACCESS from the event set before checking for mixed
+ // events.
+ Events.remove(GLOBAL_INV_ACCESS);
+ // Return true only if multiple event types remain after removing
+ // GLOBAL_INV.
+ return Events.twoOrMore();
+ }
+
return hasMixedPendingEvents(T);
}
@@ -1373,7 +1733,7 @@ bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
if (Opcode == Waitcnt->getOpcode())
return false;
- Waitcnt->setDesc(TII->get(Opcode));
+ Waitcnt->setDesc(TII.get(Opcode));
return true;
}
@@ -1385,7 +1745,6 @@ bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
- assert(ST);
assert(isNormalMode(MaxCounter));
bool Modified = false;
@@ -1394,7 +1753,7 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
LLVM_DEBUG({
dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
- if (It == OldWaitcntInstr.getParent()->instr_end())
+ if (It.isEnd())
dbgs() << "end of block\n";
else
dbgs() << *It;
@@ -1427,11 +1786,11 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
} else
WaitcntInstr = &II;
} else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
- assert(ST->hasVMemToLDSLoad());
+ assert(ST.hasVMemToLDSLoad());
LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
- << "Before: " << Wait.LoadCnt << '\n';);
- ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
- LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);
+ << "Before: " << Wait << '\n';);
+ ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait);
+ LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
// It is possible (but unlikely) that this is the only wait instruction,
// in which case, we exit this loop without a WaitcntInstr to consume
@@ -1440,12 +1799,17 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
// possibility in an artificial MIR test since such a situation cannot be
// recreated by running the memory legalizer.
II.eraseFromParent();
+ } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
+ unsigned N = II.getOperand(0).getImm();
+ LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';);
+ AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
+ Wait = Wait.combined(OldWait);
} else {
assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
unsigned OldVSCnt =
- TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
if (TrySimplify)
ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
@@ -1470,13 +1834,12 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
Wait.ExpCnt = ~0u;
Wait.DsCnt = ~0u;
- LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
- ? dbgs()
- << "applied pre-existing waitcnt\n"
- << "New Instr at block end: " << *WaitcntInstr << '\n'
- : dbgs() << "applied pre-existing waitcnt\n"
- << "Old Instr: " << *It
- << "New Instr: " << *WaitcntInstr << '\n');
+ LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
+ << "New Instr at block end: "
+ << *WaitcntInstr << '\n'
+ : dbgs() << "applied pre-existing waitcnt\n"
+ << "Old Instr: " << *It
+ << "New Instr: " << *WaitcntInstr << '\n');
}
if (WaitcntVsCntInstr) {
@@ -1487,7 +1850,7 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
Wait.StoreCnt = ~0u;
- LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
+ LLVM_DEBUG(It.isEnd()
? dbgs() << "applied pre-existing waitcnt\n"
<< "New Instr at block end: " << *WaitcntVsCntInstr
<< '\n'
@@ -1503,38 +1866,100 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
/// required counters in \p Wait
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) {
- assert(ST);
+ AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
assert(isNormalMode(MaxCounter));
bool Modified = false;
const DebugLoc &DL = Block.findDebugLoc(It);
+ // Helper to emit expanded waitcnt sequence for profiling.
+ // Emits waitcnts from (Outstanding-1) down to Target.
+ // The EmitWaitcnt callback emits a single waitcnt.
+ auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
+ auto EmitWaitcnt) {
+ do {
+ EmitWaitcnt(--Outstanding);
+ } while (Outstanding > Target);
+ Modified = true;
+ };
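
For example, a request to wait for vmcnt(1) with four loads outstanding expands to the sequence s_waitcnt vmcnt(3), vmcnt(2), vmcnt(1), letting a profiler attribute stall cycles to individual loads. A standalone model of the countdown:

    #include <cassert>
    #include <vector>

    // Countdown expansion: emit waits from Outstanding-1 down to Target so a
    // profiler sees one instruction retire per wait.
    static std::vector<unsigned> expandWaits(unsigned Outstanding,
                                             unsigned Target) {
      std::vector<unsigned> Emitted;
      do {
        Emitted.push_back(--Outstanding);
      } while (Outstanding > Target);
      return Emitted;
    }

    int main() {
      // s_waitcnt vmcnt(1) with 4 outstanding loads expands to 3, 2, 1.
      assert((expandWaits(4, 1) == std::vector<unsigned>{3, 2, 1}));
      // With exactly one extra operation outstanding, one wait is emitted.
      assert((expandWaits(1, 0) == std::vector<unsigned>{0}));
    }
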
+
// Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
// single instruction while VScnt has its own instruction.
if (Wait.hasWaitExceptStoreCnt()) {
- unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
- [[maybe_unused]] auto SWaitInst =
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
- Modified = true;
+ // If profiling expansion is enabled, emit an expanded sequence
+ if (ExpandWaitcntProfiling) {
+ // Check if any of the counters to be waited on are out-of-order.
+ // If so, fall back to normal (non-expanded) behavior since expansion
+ // would provide misleading profiling information.
+ bool AnyOutOfOrder = false;
+ for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
+ unsigned WaitCnt = Wait.get(CT);
+ if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
+ AnyOutOfOrder = true;
+ break;
+ }
+ }
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
+ if (AnyOutOfOrder) {
+ // Fall back to non-expanded wait
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ Modified = true;
+ } else {
+ // All counters are in-order, safe to expand
+ for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
+ unsigned WaitCnt = Wait.get(CT);
+ if (WaitCnt == ~0u)
+ continue;
+
+ unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
+ getWaitCountMax(getLimits(), CT) - 1);
+ EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
+ AMDGPU::Waitcnt W;
+ W.set(CT, Count);
+ BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT))
+ .addImm(AMDGPU::encodeWaitcnt(IV, W));
+ });
+ }
+ }
+ } else {
+ // Normal behavior: emit single combined waitcnt
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ [[maybe_unused]] auto SWaitInst =
+ BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ Modified = true;
+
+ LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
+ }
}
if (Wait.hasWaitStoreCnt()) {
- assert(ST->hasVscnt());
-
- [[maybe_unused]] auto SWaitInst =
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ assert(ST.hasVscnt());
+
+ if (ExpandWaitcntProfiling && Wait.StoreCnt != ~0u &&
+ !ScoreBrackets.counterOutOfOrder(STORE_CNT)) {
+ // Only expand if counter is not out-of-order
+ unsigned Outstanding =
+ std::min(ScoreBrackets.getOutstanding(STORE_CNT),
+ getWaitCountMax(getLimits(), STORE_CNT) - 1);
+ EmitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) {
+ BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(Wait.StoreCnt);
- Modified = true;
+ .addImm(Count);
+ });
+ } else {
+ [[maybe_unused]] auto SWaitInst =
+ BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(Wait.StoreCnt);
+ Modified = true;
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
+ LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
+ }
}
return Modified;
@@ -1542,13 +1967,14 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
AMDGPU::Waitcnt
WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
- return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
+ return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST.hasVscnt() ? 0 : ~0u);
}
AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
+ unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
- ~0u /* XCNT */);
+ ~0u /* XCNT */, ExpertVal, ExpertVal);
}
/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
@@ -1558,22 +1984,25 @@ WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
- assert(ST);
assert(!isNormalMode(MaxCounter));
bool Modified = false;
MachineInstr *CombinedLoadDsCntInstr = nullptr;
MachineInstr *CombinedStoreDsCntInstr = nullptr;
+ MachineInstr *WaitcntDepctrInstr = nullptr;
MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
LLVM_DEBUG({
dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
- if (It == OldWaitcntInstr.getParent()->instr_end())
+ if (It.isEnd())
dbgs() << "end of block\n";
else
dbgs() << *It;
});
+ // Accumulate waits that should not be simplified.
+ AMDGPU::Waitcnt RequiredWait;
+
for (auto &II :
make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
@@ -1597,45 +2026,81 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
unsigned OldEnc =
- TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
if (TrySimplify)
- ScoreBrackets.simplifyWaitcnt(OldWait);
- Wait = Wait.combined(OldWait);
+ Wait = Wait.combined(OldWait);
+ else
+ RequiredWait = RequiredWait.combined(OldWait);
UpdatableInstr = &CombinedLoadDsCntInstr;
} else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
unsigned OldEnc =
- TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
if (TrySimplify)
+ Wait = Wait.combined(OldWait);
+ else
+ RequiredWait = RequiredWait.combined(OldWait);
+ UpdatableInstr = &CombinedStoreDsCntInstr;
+ } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
+ unsigned OldEnc =
+ TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ AMDGPU::Waitcnt OldWait;
+ OldWait.VaVdst = AMDGPU::DepCtr::decodeFieldVaVdst(OldEnc);
+ OldWait.VmVsrc = AMDGPU::DepCtr::decodeFieldVmVsrc(OldEnc);
+ if (TrySimplify)
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);
- UpdatableInstr = &CombinedStoreDsCntInstr;
+ UpdatableInstr = &WaitcntDepctrInstr;
} else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
// Architectures higher than GFX10 do not have direct loads to
// LDS, so no work required here yet.
II.eraseFromParent();
continue;
+ } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
+ reportFatalUsageError("WAIT_ASYNCMARK is not ready for GFX12 yet");
} else {
std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
assert(CT.has_value());
unsigned OldCnt =
- TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
if (TrySimplify)
- ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
- addWait(Wait, CT.value(), OldCnt);
+ addWait(Wait, CT.value(), OldCnt);
+ else
+ addWait(RequiredWait, CT.value(), OldCnt);
UpdatableInstr = &WaitInstrs[CT.value()];
}
// Merge consecutive waitcnt of the same type by erasing multiples.
if (!*UpdatableInstr) {
*UpdatableInstr = &II;
+ } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
+ // S_WAITCNT_DEPCTR requires special care. Don't remove a
+ // duplicate if it is waiting on things other than VA_VDST or
+ // VM_VSRC. If that is the case, just make sure the VA_VDST and
+ // VM_VSRC subfields of the operand are set to the "no wait"
+ // values.
+
+ unsigned Enc = TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
+ Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);
+
+ if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
+ Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
+ Modified |= promoteSoftWaitCnt(&II);
+ } else {
+ II.eraseFromParent();
+ Modified = true;
+ }
} else {
II.eraseFromParent();
Modified = true;
}
}
+ ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
+ Wait = Wait.combined(RequiredWait);
+
if (CombinedLoadDsCntInstr) {
// Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
// to be waited for. Otherwise, let the instruction be deleted so
@@ -1644,6 +2109,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
// createNewWaitcnt(). As a side effect, resetting the wait counts will
// cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
// the loop below that deals with single counter instructions.
+ //
+ // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
+ // instructions that have decremented LOAD_CNT or DS_CNT on completion
+ // will have needed to wait for their register sources to be available
+ // first.
if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
@@ -1654,13 +2124,12 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
Wait.LoadCnt = ~0u;
Wait.DsCnt = ~0u;
- LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
- ? dbgs() << "applied pre-existing waitcnt\n"
- << "New Instr at block end: "
- << *CombinedLoadDsCntInstr << '\n'
- : dbgs() << "applied pre-existing waitcnt\n"
- << "Old Instr: " << *It << "New Instr: "
- << *CombinedLoadDsCntInstr << '\n');
+ LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
+ << "New Instr at block end: "
+ << *CombinedLoadDsCntInstr << '\n'
+ : dbgs() << "applied pre-existing waitcnt\n"
+ << "Old Instr: " << *It << "New Instr: "
+ << *CombinedLoadDsCntInstr << '\n');
} else {
CombinedLoadDsCntInstr->eraseFromParent();
Modified = true;
@@ -1679,13 +2148,12 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
Wait.StoreCnt = ~0u;
Wait.DsCnt = ~0u;
- LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
- ? dbgs() << "applied pre-existing waitcnt\n"
- << "New Instr at block end: "
- << *CombinedStoreDsCntInstr << '\n'
- : dbgs() << "applied pre-existing waitcnt\n"
- << "Old Instr: " << *It << "New Instr: "
- << *CombinedStoreDsCntInstr << '\n');
+ LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
+ << "New Instr at block end: "
+ << *CombinedStoreDsCntInstr << '\n'
+ : dbgs() << "applied pre-existing waitcnt\n"
+ << "Old Instr: " << *It << "New Instr: "
+ << *CombinedStoreDsCntInstr << '\n');
} else {
CombinedStoreDsCntInstr->eraseFromParent();
Modified = true;
@@ -1729,7 +2197,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
if (!WaitInstrs[CT])
continue;
- unsigned NewCnt = getWait(Wait, CT);
+ unsigned NewCnt = Wait.get(CT);
if (NewCnt != ~0u) {
Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
AMDGPU::OpName::simm16, NewCnt);
@@ -1738,7 +2206,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
ScoreBrackets.applyWaitcnt(CT, NewCnt);
setNoWait(Wait, CT);
- LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ LLVM_DEBUG(It.isEnd()
? dbgs() << "applied pre-existing waitcnt\n"
<< "New Instr at block end: " << *WaitInstrs[CT]
<< '\n'
@@ -1751,19 +2219,86 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
}
}
+ if (WaitcntDepctrInstr) {
+ // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
+ // subfields with the new required values.
+ unsigned Enc =
+ TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
+ ->getImm();
+ Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, Wait.VmVsrc);
+ Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst);
+
+ ScoreBrackets.applyWaitcnt(VA_VDST, Wait.VaVdst);
+ ScoreBrackets.applyWaitcnt(VM_VSRC, Wait.VmVsrc);
+ Wait.VaVdst = ~0u;
+ Wait.VmVsrc = ~0u;
+
+ // If that new encoded Depctr immediate would actually still wait
+ // for anything, update the instruction's operand. Otherwise it can
+ // just be deleted.
+ if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
+ Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
+ AMDGPU::OpName::simm16, Enc);
+ LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: "
+ << *WaitcntDepctrInstr << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It << "New Instr: "
+ << *WaitcntDepctrInstr << '\n');
+ } else {
+ WaitcntDepctrInstr->eraseFromParent();
+ Modified = true;
+ }
+ }
+
return Modified;
}
/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) {
- assert(ST);
+ AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
assert(!isNormalMode(MaxCounter));
bool Modified = false;
const DebugLoc &DL = Block.findDebugLoc(It);
+ // Helper to emit expanded waitcnt sequence for profiling.
+ auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
+ auto EmitWaitcnt) {
+ for (unsigned I = Outstanding - 1; I > Target && I != ~0u; --I)
+ EmitWaitcnt(I);
+ EmitWaitcnt(Target);
+ Modified = true;
+ };
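
This variant differs from the pre-GFX12 helper: the loop counts down only to Target + 1 and a final exact wait on Target is emitted unconditionally, while the I != ~0u test stops the unsigned countdown from wrapping around when Outstanding is 0. A standalone model:

    #include <cassert>
    #include <vector>

    // GFX12-style expansion: count down to Target+1, then finish with an
    // exact wait on Target. The I != ~0u check guards against unsigned
    // wraparound when Outstanding == 0.
    static std::vector<unsigned> expandWaitsGfx12(unsigned Outstanding,
                                                  unsigned Target) {
      std::vector<unsigned> Emitted;
      for (unsigned I = Outstanding - 1; I > Target && I != ~0u; --I)
        Emitted.push_back(I);
      Emitted.push_back(Target);
      return Emitted;
    }

    int main() {
      assert((expandWaitsGfx12(4, 1) == std::vector<unsigned>{3, 2, 1}));
      assert((expandWaitsGfx12(0, 0) == std::vector<unsigned>{0})); // no wrap
    }
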
+
+ // For GFX12+, we use separate wait instructions, which makes expansion
+ // simpler
+ if (ExpandWaitcntProfiling) {
+ for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+ unsigned Count = Wait.get(CT);
+ if (Count == ~0u)
+ continue;
+
+ // Skip expansion for out-of-order counters - emit normal wait instead
+ if (ScoreBrackets.counterOutOfOrder(CT)) {
+ BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
+ .addImm(Count);
+ Modified = true;
+ continue;
+ }
+
+ unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
+ getWaitCountMax(getLimits(), CT) - 1);
+ EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
+ BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
+ .addImm(Val);
+ });
+ }
+ return Modified;
+ }
+
+ // Normal behavior (no expansion)
// Check for opportunities to use combined wait instructions.
if (Wait.DsCnt != ~0u) {
MachineInstr *SWaitInst = nullptr;
@@ -1771,7 +2306,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
if (Wait.LoadCnt != ~0u) {
unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
- SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
+ SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
.addImm(Enc);
Wait.LoadCnt = ~0u;
@@ -1779,9 +2314,8 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
} else if (Wait.StoreCnt != ~0u) {
unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
- SWaitInst =
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
- .addImm(Enc);
+ SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_STORECNT_DSCNT))
+ .addImm(Enc);
Wait.StoreCnt = ~0u;
Wait.DsCnt = ~0u;
@@ -1790,7 +2324,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
if (SWaitInst) {
Modified = true;
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
dbgs() << "New Instr: " << *SWaitInst << '\n');
}
@@ -1800,16 +2334,31 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
// waiting for.
for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
- unsigned Count = getWait(Wait, CT);
+ unsigned Count = Wait.get(CT);
if (Count == ~0u)
continue;
[[maybe_unused]] auto SWaitInst =
- BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
+ BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
.addImm(Count);
Modified = true;
+ LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
+ }
+
+ if (Wait.hasWaitDepctr()) {
+ assert(IsExpertMode);
+ unsigned Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Wait.VmVsrc, ST);
+ Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst);
+
+ [[maybe_unused]] auto SWaitInst =
+ BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc);
+
+ Modified = true;
+
LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
dbgs() << "New Instr: " << *SWaitInst << '\n');
@@ -1818,19 +2367,6 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
return Modified;
}
-/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
-static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
- // Currently all conventions wait, but this may not always be the case.
- //
- // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
- // senses to omit the wait and do it in the caller.
- return true;
-}
-
-/// \returns true if the callee is expected to wait for any outstanding waits
-/// before returning.
-static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
-
/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
@@ -1841,12 +2377,13 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
-/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
-/// flush the vmcnt counter here.
-bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
- WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr,
- bool FlushVmCnt) {
+/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
+/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
+/// (GFX12+ only, where DS_CNT is a separate counter).
+bool SIInsertWaitcnts::generateWaitcntInstBefore(
+ MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
+ LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
setForceEmitWaitcnt();
assert(!MI.isMetaInstruction());
@@ -1854,54 +2391,70 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
AMDGPU::Waitcnt Wait;
const unsigned Opc = MI.getOpcode();
- // FIXME: This should have already been handled by the memory legalizer.
- // Removing this currently doesn't affect any lit tests, but we need to
- // verify that nothing was relying on this. The number of buffer invalidates
- // being handled here should not be expanded.
- if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
- Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
- Opc == AMDGPU::BUFFER_GL1_INV) {
- Wait.LoadCnt = 0;
- }
-
- // All waits must be resolved at call return.
- // NOTE: this could be improved with knowledge of all call sites or
- // with knowledge of the called routines.
- if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
- Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
- Opc == AMDGPU::S_SETPC_B64_return ||
- (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
- Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
- }
- // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
- // Technically the hardware will do this on its own if we don't, but that
- // might cost extra cycles compared to doing it explicitly.
- // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
- // have to wait for outstanding VMEM stores. In this case it can be useful to
- // send a message to explicitly release all VGPRs before the stores have
- // completed, but it is only safe to do this if there are no outstanding
- // scratch stores.
- else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
- if (!WCG->isOptNone() &&
- (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
- (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
- ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
- !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
- ReleaseVGPRInsts.insert(&MI);
- }
- // Resolve vm waits before gs-done.
- else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
- ST->hasLegacyGeometry() &&
- ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
- AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
+ switch (Opc) {
+ case AMDGPU::BUFFER_WBINVL1:
+ case AMDGPU::BUFFER_WBINVL1_SC:
+ case AMDGPU::BUFFER_WBINVL1_VOL:
+ case AMDGPU::BUFFER_GL0_INV:
+ case AMDGPU::BUFFER_GL1_INV: {
+ // FIXME: This should have already been handled by the memory legalizer.
+ // Removing this currently doesn't affect any lit tests, but we need to
+ // verify that nothing was relying on this. The number of buffer invalidates
+ // being handled here should not be expanded.
Wait.LoadCnt = 0;
+ break;
+ }
+ case AMDGPU::SI_RETURN_TO_EPILOG:
+ case AMDGPU::SI_RETURN:
+ case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
+ case AMDGPU::S_SETPC_B64_return: {
+ // All waits must be resolved at call return.
+ // NOTE: this could be improved with knowledge of all call sites or
+ // with knowledge of the called routines.
+ ReturnInsts.insert(&MI);
+ AMDGPU::Waitcnt AllZeroWait =
+ WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
+ // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
+ // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
+ // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
+ // no need to wait for it at function boundaries.
+ if (ST->hasExtendedWaitCounts() &&
+ !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
+ AllZeroWait.LoadCnt = ~0u;
+ Wait = AllZeroWait;
+ break;
+ }
+ case AMDGPU::S_ENDPGM:
+ case AMDGPU::S_ENDPGM_SAVED: {
+ // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
+ // Technically the hardware will do this on its own if we don't, but that
+ // might cost extra cycles compared to doing it explicitly.
+ // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
+ // have to wait for outstanding VMEM stores. In this case it can be useful
+ // to send a message to explicitly release all VGPRs before the stores have
+ // completed, but it is only safe to do this if there are no outstanding
+ // scratch stores.
+ EndPgmInsts[&MI] = !ScoreBrackets.empty(STORE_CNT) &&
+ !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
+ break;
+ }
+ case AMDGPU::S_SENDMSG:
+ case AMDGPU::S_SENDMSGHALT: {
+ if (ST->hasLegacyGeometry() &&
+ ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
+ AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
+ // Resolve vm waits before gs-done.
+ Wait.LoadCnt = 0;
+ break;
+ }
+ [[fallthrough]];
}
+ default: {
- // Export & GDS instructions do not read the EXEC mask until after the export
- // is granted (which can occur well after the instruction is issued).
- // The shader program must flush all EXP operations on the export-count
- // before overwriting the EXEC mask.
- else {
+ // Export & GDS instructions do not read the EXEC mask until after the
+ // export is granted (which can occur well after the instruction is issued).
+ // The shader program must flush all EXP operations on the export-count
+ // before overwriting the EXEC mask.
if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
// Export and GDS are tracked individually, either may trigger a waitcnt
// for EXEC.
@@ -1918,27 +2471,22 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
- if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
+ if (MI.isCall()) {
// The function is going to insert a wait on everything in its prolog.
// This still needs to be careful if the call target is a load (e.g. a GOT
// load). We also need to check WAW dependency with saved PC.
+ CallInsts.insert(&MI);
Wait = AMDGPU::Waitcnt();
- const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ const MachineOperand &CallAddrOp = TII->getCalleeOperand(MI);
if (CallAddrOp.isReg()) {
- RegInterval CallAddrOpInterval =
- ScoreBrackets.getRegInterval(&MI, CallAddrOp);
-
- ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval,
- Wait);
+ ScoreBrackets.determineWaitForPhysReg(
+ SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait);
if (const auto *RtnAddrOp =
TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
- RegInterval RtnAddrOpInterval =
- ScoreBrackets.getRegInterval(&MI, *RtnAddrOp);
-
- ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval,
- Wait);
+ ScoreBrackets.determineWaitForPhysReg(
+ SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait);
}
}
} else if (Opc == AMDGPU::S_BARRIER_WAIT) {
@@ -1975,18 +2523,26 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
continue;
// LOAD_CNT is only relevant to vgpr or LDS.
- unsigned RegNo = FIRST_LDS_VGPR;
+ unsigned TID = LDSDMA_BEGIN;
if (Ptr && Memop->getAAInfo()) {
const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
- if (MI.mayAlias(AA, *LDSDMAStores[I], true))
- ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
+ if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
+ if ((I + 1) >= NUM_LDSDMA) {
+ // We didn't have enough slots to track this LDS DMA store; it
+ // has been tracked using the common slot (LDSDMA_BEGIN).
+ ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
+ break;
+ }
+
+ ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait);
+ }
}
} else {
- ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
+ ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
}
if (Memop->isStore()) {
- ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
+ ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait);
}
}
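A minimal standalone sketch of the slot-overflow scheme above, assuming hypothetical names (SlotTracker, kNumSlots, record, waitScore) and collapsing the per-counter score arrays of the real pass into a single value: tracked stores get private slots, and any overflow falls back to a shared slot that aliasing loads must then conservatively wait on.

#include <algorithm>
#include <array>
#include <cstdio>

constexpr unsigned kNumSlots = 4; // slot 0 is the shared catch-all slot

struct SlotTracker {
  std::array<unsigned, kNumSlots> Score{}; // pending score per slot

  // Record the I-th tracked store; overflow goes to the shared slot 0.
  void record(unsigned I, unsigned S) {
    unsigned Slot = (I + 1 < kNumSlots) ? I + 1 : 0;
    Score[Slot] = std::max(Score[Slot], S);
  }

  // A load aliasing the I-th store waits on its slot; if the store
  // overflowed, it must conservatively wait on the shared slot.
  unsigned waitScore(unsigned I) const {
    unsigned Slot = (I + 1 < kNumSlots) ? I + 1 : 0;
    return Score[Slot];
  }
};

int main() {
  SlotTracker T;
  for (unsigned I = 0; I < 6; ++I) // six stores, only three private slots
    T.record(I, /*S=*/10 + I);
  std::printf("store 1 waits on %u, store 5 (overflow) waits on %u\n",
              T.waitScore(1), T.waitScore(5)); // prints 11 and 15
}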
@@ -1999,7 +2555,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
continue;
- RegInterval Interval = ScoreBrackets.getRegInterval(&MI, Op);
+ MCPhysReg Reg = Op.getReg().asMCReg();
const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
if (IsVGPR) {
@@ -2011,6 +2567,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (Op.isImplicit() && MI.mayLoadOrStore())
continue;
+ ScoreBrackets.determineWaitForPhysReg(VA_VDST, Reg, Wait);
+ if (Op.isDef())
+ ScoreBrackets.determineWaitForPhysReg(VM_VSRC, Reg, Wait);
// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
// previous write and this write are the same type of VMEM
// instruction, in which case they are (in some architectures)
@@ -2018,31 +2577,31 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// Additionally check instructions where Point Sample Acceleration
// might be applied.
if (Op.isUse() || !updateVMCntOnly(MI) ||
- ScoreBrackets.hasOtherPendingVmemTypes(Interval,
- getVmemType(MI)) ||
- ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) ||
+ ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
+ ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
!ST->hasVmemWriteVgprInOrder()) {
- ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait);
- ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait);
- ScoreBrackets.determineWait(BVH_CNT, Interval, Wait);
- ScoreBrackets.clearVgprVmemTypes(Interval);
+ ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait);
+ ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait);
+ ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait);
+ ScoreBrackets.clearVgprVmemTypes(Reg);
}
if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
- ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
+ ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait);
}
- ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
+ ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait);
} else if (Op.getReg() == AMDGPU::SCC) {
- ScoreBrackets.determineWait(KM_CNT, Interval, Wait);
+ ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait);
} else {
- ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
+ ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);
}
- if (ST->hasWaitXCnt() && Op.isDef())
- ScoreBrackets.determineWait(X_CNT, Interval, Wait);
+ if (ST->hasWaitXcnt() && Op.isDef())
+ ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait);
}
}
}
+ }
// Ensure safety against exceptions from outstanding memory operations while
// waiting for a barrier:
@@ -2057,7 +2616,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// In all other cases, ensure safety by ensuring that there are no outstanding
// memory operations.
if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
- !ST->supportsBackOffBarrier()) {
+ !ST->hasBackOffBarrier()) {
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
}
@@ -2072,35 +2631,41 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// Verify that the wait is actually needed.
ScoreBrackets.simplifyWaitcnt(Wait);
+ // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
+ // waits on VA_VDST if the instruction it would precede is not a VALU
+ // instruction, since hardware handles VALU->VGPR->VALU hazards in
+ // expert scheduling mode.
+ if (TII->isVALU(MI))
+ Wait.VaVdst = ~0u;
+
+ // Since the translation for VMEM addresses occurs in-order, we can apply the
+ // XCnt if the current instruction is of VMEM type and has a memory
+ // dependency with another VMEM instruction in flight.
+ if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
+ ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
+ Wait.XCnt = ~0u;
+ }
+
// When forcing emit, we need to skip terminators because that would break the
// terminators of the MBB if we emit a waitcnt between terminators.
if (ForceEmitZeroFlag && !MI.isTerminator())
Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
- if (ForceEmitWaitcnt[LOAD_CNT])
- Wait.LoadCnt = 0;
- if (ForceEmitWaitcnt[EXP_CNT])
- Wait.ExpCnt = 0;
- if (ForceEmitWaitcnt[DS_CNT])
- Wait.DsCnt = 0;
- if (ForceEmitWaitcnt[SAMPLE_CNT])
- Wait.SampleCnt = 0;
- if (ForceEmitWaitcnt[BVH_CNT])
- Wait.BvhCnt = 0;
- if (ForceEmitWaitcnt[KM_CNT])
- Wait.KmCnt = 0;
- if (ForceEmitWaitcnt[X_CNT])
- Wait.XCnt = 0;
-
- if (FlushVmCnt) {
- if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
- Wait.LoadCnt = 0;
- if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
- Wait.SampleCnt = 0;
- if (ScoreBrackets.hasPendingEvent(BVH_CNT))
- Wait.BvhCnt = 0;
+ // If we are forcing waitcnt emission, update Wait accordingly.
+ for (InstCounterType T : inst_counter_types()) {
+ if (!ForceEmitWaitcnt[T])
+ continue;
+ Wait.set(T, 0);
+ }
+
+ if (FlushFlags.FlushVmCnt) {
+ for (InstCounterType T : {LOAD_CNT, SAMPLE_CNT, BVH_CNT})
+ Wait.set(T, 0);
}
+ if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
+ Wait.DsCnt = 0;
+
if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
Wait.LoadCnt = 0;
@@ -2121,10 +2686,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
Modified =
WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
- // Any counts that could have been applied to any existing waitcnt
- // instructions will have been done so, now deal with any remaining.
- ScoreBrackets.applyWaitcnt(Wait);
-
// ExpCnt can be merged into VINTERP.
if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
SIInstrInfo::isVINTERP(*It)) {
@@ -2134,31 +2695,59 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
WaitExp->setImm(Wait.ExpCnt);
Modified = true;
}
+ // Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts.
+ ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
Wait.ExpCnt = ~0u;
LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
<< "Update Instr: " << *It);
}
- // XCnt may be already consumed by a load wait.
- if (Wait.XCnt != ~0u) {
- if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
- Wait.XCnt = ~0u;
+ if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
+ Modified = true;
- if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
- Wait.XCnt = ~0u;
+ // Any counts that could have been applied to any existing waitcnt
+ // instructions will have been done so, now deal with any remaining.
+ ScoreBrackets.applyWaitcnt(Wait);
- // Since the translation for VMEM addresses occur in-order, we can skip the
- // XCnt if the current instruction is of VMEM type and has a memory
- // dependency with another VMEM instruction in flight.
- if (isVmemAccess(*It))
- Wait.XCnt = ~0u;
+ return Modified;
+}
+
+std::optional<WaitEventType>
+SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
+ if (TII->isVALU(Inst)) {
+ // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
+ // out-of-order with respect to each other, so each of these classes
+ // has its own event.
+
+ if (TII->isXDL(Inst))
+ return VGPR_XDL_WRITE;
+
+ if (TII->isTRANS(Inst))
+ return VGPR_TRANS_WRITE;
+
+ if (AMDGPU::isDPMACCInstruction(Inst.getOpcode()))
+ return VGPR_DPMACC_WRITE;
+
+ return VGPR_CSMACC_WRITE;
}
- if (WCG->createNewWaitcnt(Block, It, Wait))
- Modified = true;
+ // FLAT and LDS instructions may read their VGPR sources out-of-order
+ // with respect to each other and all other VMEM instructions, so
+ // each of these also has a separate event.
- return Modified;
+ if (TII->isFLAT(Inst))
+ return VGPR_FLAT_READ;
+
+ if (TII->isDS(Inst))
+ return VGPR_LDS_READ;
+
+ if (TII->isVMEM(Inst) || TII->isVIMAGE(Inst) || TII->isVSAMPLE(Inst))
+ return VGPR_VMEM_READ;
+
+ // Otherwise, no hazard.
+
+ return {};
}
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
@@ -2235,6 +2824,12 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
bool IsVMEMAccess = false;
bool IsSMEMAccess = false;
+
+ if (IsExpertMode) {
+ if (const auto ET = getExpertSchedulingEventType(Inst))
+ ScoreBrackets->updateByEvent(*ET, Inst);
+ }
+
if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
if (TII->isAlwaysGDS(Inst.getOpcode()) ||
TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
@@ -2265,13 +2860,19 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
}
- // This is a flat memory operation that access both VMEM and LDS, so note it
- // - it will require that both the VM and LGKM be flushed to zero if it is
- // pending when a VM or LGKM dependency occurs.
- if (FlatASCount > 1)
+ // Async/LDSDMA operations have FLAT encoding but do not actually use flat
+ // pointers. They do have two operands that each access global and LDS, thus
+ // making it appear at this point that they are using a flat pointer. Filter
+ // them out, and for the rest, generate a dependency on flat pointers so
+ // that both VM and LGKM counters are flushed.
+ if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1)
ScoreBrackets->setPendingFlat();
} else if (SIInstrInfo::isVMEM(Inst) &&
- !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
+ (!AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode()) ||
+ Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) {
+ // BUFFER_WBL2 is included here because, unlike the invalidates, it must
+ // be followed by "S_WAITCNT vmcnt(0)" to ensure the writeback has
+ // completed.
IsVMEMAccess = true;
ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
@@ -2283,15 +2884,9 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
IsSMEMAccess = true;
ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
} else if (Inst.isCall()) {
- if (callWaitsOnFunctionReturn(Inst)) {
- // Act as a wait on everything
- ScoreBrackets->applyWaitcnt(
- WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
- ScoreBrackets->setStateOnFunctionEntryOrReturn();
- } else {
- // May need to way wait for anything.
- ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
- }
+ // Act as a wait on everything
+ ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
+ ScoreBrackets->setStateOnFunctionEntryOrReturn();
} else if (SIInstrInfo::isLDSDIR(Inst)) {
ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);
} else if (TII->isVINTERP(Inst)) {
@@ -2324,7 +2919,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
}
}
- if (!ST->hasWaitXCnt())
+ if (!ST->hasWaitXcnt())
return;
if (IsVMEMAccess)
@@ -2343,6 +2938,84 @@ bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
return OtherShifted > MyShifted;
}
+bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
+ ArrayRef<CounterValueArray> OtherMarks) {
+ bool StrictDom = false;
+
+ LLVM_DEBUG(dbgs() << "Merging async marks ...");
+ // Early exit: both empty
+ if (AsyncMarks.empty() && OtherMarks.empty()) {
+ LLVM_DEBUG(dbgs() << " nothing to merge\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << '\n');
+
+ // Determine maximum length needed after merging
+ auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size());
+
+ // For each backedge in isolation, the algorithm reaches a fixed point after
+ // the first call to merge(). This is unchanged even with the AsyncMarks
+ // array because we call mergeScore just like the other cases.
+ //
+ // But in the rare pathological case, a nest of loops that pushes marks
+ // without waiting on any mark can cause AsyncMarks to grow very large. We cap
+ // it to a reasonable limit. We can tune this later or potentially introduce a
+ // user option to control the value.
+ MaxSize = std::min(MaxSize, MaxAsyncMarks);
+
+ // Keep only the most recent marks within our limit.
+ if (AsyncMarks.size() > MaxSize)
+ AsyncMarks.erase(AsyncMarks.begin(),
+ AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
+
+ // Pad with zero-filled marks if our list is shorter. Zero represents "no
+ // pending async operations at this checkpoint" and acts as the identity
+ // element for max() during merging. We pad at the beginning since the marks
+ // need to be aligned in most-recent order.
+ CounterValueArray ZeroMark{};
+ AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
+
+ LLVM_DEBUG({
+ dbgs() << "Before merge:\n";
+ for (const auto &Mark : AsyncMarks) {
+ llvm::interleaveComma(Mark, dbgs());
+ dbgs() << '\n';
+ }
+ });
+
+ LLVM_DEBUG({
+ dbgs() << "Other marks:\n";
+ for (const auto &Mark : OtherMarks) {
+ llvm::interleaveComma(Mark, dbgs());
+ dbgs() << '\n';
+ }
+ });
+
+ // Merge element-wise using the existing mergeScore function and the
+ // appropriate MergeInfo for each counter type. Iterate only while we have
+ // elements in both vectors.
+ unsigned OtherSize = OtherMarks.size();
+ unsigned OurSize = AsyncMarks.size();
+ unsigned MergeCount = std::min(OtherSize, OurSize);
+ assert(OurSize == MaxSize);
+ for (unsigned Idx = 1; Idx <= MergeCount; ++Idx) {
+ for (auto T : inst_counter_types(Context->MaxCounter)) {
+ StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
+ OtherMarks[OtherSize - Idx][T]);
+ }
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "After merge:\n";
+ for (const auto &Mark : AsyncMarks) {
+ llvm::interleaveComma(Mark, dbgs());
+ dbgs() << '\n';
+ }
+ });
+
+ return StrictDom;
+}
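A minimal sketch of the mark-merging strategy above, under two simplifying assumptions: a single counter per mark, and a plain max standing in for the mergeScore rebasing the real code performs. It shows the cap, the front-padding with the identity element 0, and the most-recent-last alignment.

#include <algorithm>
#include <cstdio>
#include <deque>

// Marks are kept oldest-first; merging aligns the two lists at the
// most-recent end, pads the shorter one with 0, and takes the max.
static std::deque<unsigned> mergeMarks(std::deque<unsigned> A,
                                       const std::deque<unsigned> &B,
                                       unsigned Cap) {
  unsigned MaxSize = std::min<unsigned>(std::max(A.size(), B.size()), Cap);
  // Drop the oldest marks beyond the cap.
  while (A.size() > MaxSize)
    A.pop_front();
  // Pad at the front so both lists are aligned most-recent-last.
  while (A.size() < MaxSize)
    A.push_front(0);
  for (unsigned Idx = 1; Idx <= std::min<unsigned>(B.size(), MaxSize); ++Idx)
    A[MaxSize - Idx] = std::max(A[MaxSize - Idx], B[B.size() - Idx]);
  return A;
}

int main() {
  std::deque<unsigned> X{3, 7}, Y{1, 9, 2};
  auto M = mergeMarks(X, Y, /*Cap=*/8);
  for (unsigned V : M)
    std::printf("%u ", V); // prints "1 9 7": padded, then max-merged
}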
+
/// Merge the pending events and associated score brackets of \p Other into
/// this brackets status.
///
@@ -2351,15 +3024,22 @@ bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
bool StrictDom = false;
- VgprUB = std::max(VgprUB, Other.VgprUB);
- SgprUB = std::max(SgprUB, Other.SgprUB);
+ // Check if "other" has keys we don't have, and create default entries for
+ // those. If they remain empty after merging, we will clean them up afterwards.
+ for (auto K : Other.VMem.keys())
+ VMem.try_emplace(K);
+ for (auto K : Other.SGPRs.keys())
+ SGPRs.try_emplace(K);
+
+ // Array to store MergeInfo for each counter type
+ MergeInfo MergeInfos[NUM_INST_CNTS];
for (auto T : inst_counter_types(Context->MaxCounter)) {
// Merge event flags for this counter
- const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
- const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
- const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
- if (OtherEvents & ~OldEvents)
+ const WaitEventSet &EventsForT = Context->getWaitEvents(T);
+ const WaitEventSet OldEvents = PendingEvents & EventsForT;
+ const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;
+ if (!OldEvents.contains(OtherEvents))
StrictDom = true;
PendingEvents |= OtherEvents;
@@ -2370,7 +3050,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
if (NewUB < ScoreLBs[T])
report_fatal_error("waitcnt score overflow");
- MergeInfo M;
+ MergeInfo &M = MergeInfos[T];
M.OldLB = ScoreLBs[T];
M.OtherLB = Other.ScoreLBs[T];
M.MyShift = NewUB - ScoreUBs[T];
@@ -2386,8 +3066,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
if (T == KM_CNT) {
StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
if (Other.hasPendingEvent(SCC_WRITE)) {
- unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
- if (!OldEventsHasSCCWrite) {
+ if (!OldEvents.contains(SCC_WRITE)) {
PendingSCCWrite = Other.PendingSCCWrite;
} else if (PendingSCCWrite != Other.PendingSCCWrite) {
PendingSCCWrite = nullptr;
@@ -2395,23 +3074,33 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
}
}
- for (int J = 0; J <= VgprUB; J++)
- StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
+ for (auto &[RegID, Info] : VMem)
+ StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));
if (isSmemCounter(T)) {
unsigned Idx = getSgprScoresIdx(T);
- for (int J = 0; J <= SgprUB; J++)
- StrictDom |=
- mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]);
+ for (auto &[RegID, Info] : SGPRs) {
+ auto It = Other.SGPRs.find(RegID);
+ unsigned OtherScore =
+ (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
+ StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore);
+ }
}
}
- for (int J = 0; J <= VgprUB; J++) {
- unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
- StrictDom |= NewVmemTypes != VgprVmemTypes[J];
- VgprVmemTypes[J] = NewVmemTypes;
+ for (auto &[TID, Info] : VMem) {
+ if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
+ unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
+ StrictDom |= NewVmemTypes != Info.VMEMTypes;
+ Info.VMEMTypes = NewVmemTypes;
+ }
}
+ StrictDom |= mergeAsyncMarks(MergeInfos, Other.AsyncMarks);
+ for (auto T : inst_counter_types(Context->MaxCounter))
+ StrictDom |= mergeScore(MergeInfos[T], AsyncScore[T], Other.AsyncScore[T]);
+
+ purgeEmptyTrackingData();
return StrictDom;
}
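For reference, a self-contained sketch of the shift-and-max merge that each MergeInfo drives, assuming scores at or below a bracket's lower bound mean "already waited on": both brackets are rebased onto the merged upper bound before taking the max, and the return value reports whether the incoming bracket strictly dominated (forcing a rescan of the block).

#include <algorithm>
#include <cstdio>

struct MergeInfo {
  unsigned OldLB, OtherLB;      // lower bounds of the two brackets
  unsigned MyShift, OtherShift; // rebase amounts onto the merged frame
};

static bool mergeScore(const MergeInfo &M, unsigned &Score, unsigned Other) {
  // Scores at or below the LB carry no pending wait; rebase the rest.
  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
  unsigned OtherShifted = Other <= M.OtherLB ? 0 : Other + M.OtherShift;
  Score = std::max(MyShifted, OtherShifted);
  return OtherShifted > MyShifted; // "other" strictly dominated: rescan
}

int main() {
  MergeInfo M{/*OldLB=*/2, /*OtherLB=*/1, /*MyShift=*/0, /*OtherShift=*/3};
  unsigned Score = 5;
  bool StrictDom = mergeScore(M, Score, /*Other=*/4);
  std::printf("merged score %u, strict dominance %d\n", Score, StrictDom);
}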
@@ -2423,9 +3112,53 @@ static bool isWaitInstr(MachineInstr &Inst) {
Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
Opcode == AMDGPU::S_WAITCNT_lds_direct ||
+ Opcode == AMDGPU::WAIT_ASYNCMARK ||
counterTypeForInstr(Opcode).has_value();
}
+void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ bool ExpertMode) const {
+ const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
+ AMDGPU::Hwreg::ID_SCHED_MODE, AMDGPU::Hwreg::HwregOffset::Default, 2);
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SETREG_IMM32_B32))
+ .addImm(ExpertMode ? 2 : 0)
+ .addImm(EncodedReg);
+}
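A hedged sketch of the (id, offset, width) hwreg operand that setSchedulingMode encodes, assuming the common SIMM16 layout with the register id in bits [5:0], the field offset in bits [10:6], and width-minus-one in bits [15:11]; the field positions here are an assumption, and SIDefines.h remains the authoritative source.

#include <cassert>
#include <cstdint>

// Assumed SIMM16 hwreg layout: id[5:0] | offset[10:6] | (width-1)[15:11].
static uint16_t encodeHwreg(unsigned Id, unsigned Offset, unsigned Width) {
  assert(Id < 64 && Offset < 32 && Width >= 1 && Width <= 32);
  return Id | (Offset << 6) | ((Width - 1) << 11);
}

int main() {
  // e.g. a 2-bit field at offset 0 of a hypothetical hwreg id 30.
  uint16_t Enc = encodeHwreg(/*Id=*/30, /*Offset=*/0, /*Width=*/2);
  assert(Enc == (30u | (1u << 11)));
}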
+
+// Track back-to-back atomic RMW instructions, referred to as a block.
+//
+// Determines whether \p MI starts a new atomic RMW block, is inside
+// an existing block, or is outside of a block. A block is broken when a
+// CU-scoped memory op or an atomic store is encountered. ALU ops
+// and non-memory instructions don't break a block. The function returns
+// the new state after processing the current instruction based on
+// \p PrevState, the previously captured state.
+AtomicRMWState
+SIInsertWaitcnts::getAtomicRMWState(MachineInstr &MI,
+ AtomicRMWState PrevState) const {
+ if (isAtomicRMW(MI)) {
+ // Transition from NotInBlock -> NewBlock -> InsideBlock.
+ if (PrevState == AtomicRMWState::NotInBlock)
+ return AtomicRMWState::NewBlock;
+ if (PrevState == AtomicRMWState::NewBlock)
+ return AtomicRMWState::InsideBlock;
+
+ return PrevState;
+ }
+
+ // LDS memory operations don't break the block.
+ if (TII->isDS(MI) || (TII->isFLAT(MI) && TII->mayAccessLDSThroughFlat(MI)))
+ return PrevState;
+
+ // Reset the atomic RMW block state when other VMEM or SMEM operations are found.
+ if (MI.mayLoad() ^ MI.mayStore())
+ return AtomicRMWState::NotInBlock;
+
+ // Return the previous state otherwise.
+ return PrevState;
+}
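The transitions above amount to a small three-state machine; the toy model below assumes a simplified instruction classification (atomic RMW / plain memory op / everything else) in place of the real isAtomicRMW and LDS checks.

#include <cassert>

enum class AtomicRMWState { NotInBlock, NewBlock, InsideBlock };
enum class InstKind { AtomicRMW, MemLoadOrStore, Other };

static AtomicRMWState step(InstKind K, AtomicRMWState S) {
  if (K == InstKind::AtomicRMW)
    return S == AtomicRMWState::NotInBlock ? AtomicRMWState::NewBlock
           : S == AtomicRMWState::NewBlock ? AtomicRMWState::InsideBlock
                                           : S;
  if (K == InstKind::MemLoadOrStore)
    return AtomicRMWState::NotInBlock;
  return S; // ALU and other non-memory ops preserve the state
}

int main() {
  auto S = AtomicRMWState::NotInBlock;
  S = step(InstKind::AtomicRMW, S);      // NewBlock: keep the first xcnt
  S = step(InstKind::Other, S);          // ALU does not break the block
  S = step(InstKind::AtomicRMW, S);      // InsideBlock: soft xcnt redundant
  assert(S == AtomicRMWState::InsideBlock);
  S = step(InstKind::MemLoadOrStore, S); // plain memory op breaks the block
  assert(S == AtomicRMWState::NotInBlock);
}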
+
// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
MachineBasicBlock &Block,
@@ -2454,6 +3187,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Walk over the instructions.
MachineInstr *OldWaitcntInstr = nullptr;
+ AtomicRMWState RMWState = AtomicRMWState::NotInBlock;
for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
E = Block.instr_end();
@@ -2463,22 +3197,50 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
++Iter;
continue;
}
+ // Get the atomic RMW block state for current instruction.
+ RMWState = getAtomicRMWState(Inst, RMWState);
// Track pre-existing waitcnts that were added in earlier iterations or by
// the memory legalizer.
- if (isWaitInstr(Inst)) {
- if (!OldWaitcntInstr)
- OldWaitcntInstr = &Inst;
+ if (isWaitInstr(Inst) ||
+ (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
++Iter;
+ bool IsSoftXcnt = isSoftXcnt(Inst);
+ // The Memory Legalizer conservatively inserts a soft xcnt before each
+ // atomic RMW operation. However, for sequences of back-to-back atomic
+ // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
+ // the redundant soft xcnts when we're inside an atomic RMW block.
+ if (Iter != E && IsSoftXcnt) {
+ // Check if the next instruction can potentially change the atomic RMW
+ // state.
+ RMWState = getAtomicRMWState(*Iter, RMWState);
+ }
+
+ if (IsSoftXcnt && RMWState == AtomicRMWState::InsideBlock) {
+ // Delete this soft xcnt.
+ Inst.eraseFromParent();
+ Modified = true;
+ } else if (!OldWaitcntInstr) {
+ OldWaitcntInstr = &Inst;
+ }
continue;
}
- bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
- isPreheaderToFlush(Block, ScoreBrackets);
+ PreheaderFlushFlags FlushFlags;
+ if (Block.getFirstTerminator() == Inst)
+ FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
+
+ if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
+ // FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
+ assert(ST->getGeneration() < AMDGPUSubtarget::GFX12);
+ ScoreBrackets.recordAsyncMark(Inst);
+ ++Iter;
+ continue;
+ }
// Generate an s_waitcnt instruction to be placed before Inst, if needed.
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
- FlushVmCnt);
+ FlushFlags);
OldWaitcntInstr = nullptr;
// Restore vccz if it's not known to be correct already.
@@ -2552,17 +3314,21 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
++Iter;
}
- // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
- // needed.
+ // Flush counters at the end of the block if needed (for preheaders with no
+ // terminator).
AMDGPU::Waitcnt Wait;
- if (Block.getFirstTerminator() == Block.end() &&
- isPreheaderToFlush(Block, ScoreBrackets)) {
- if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
- Wait.LoadCnt = 0;
- if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
- Wait.SampleCnt = 0;
- if (ScoreBrackets.hasPendingEvent(BVH_CNT))
- Wait.BvhCnt = 0;
+ if (Block.getFirstTerminator() == Block.end()) {
+ PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
+ if (FlushFlags.FlushVmCnt) {
+ if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
+ Wait.LoadCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
+ Wait.SampleCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(BVH_CNT))
+ Wait.BvhCnt = 0;
+ }
+ if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
+ Wait.DsCnt = 0;
}
// Combine or remove any redundant waitcnts at the end of the block.
@@ -2578,29 +3344,29 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
return Modified;
}
-// Return true if the given machine basic block is a preheader of a loop in
-// which we want to flush the vmcnt counter, and false otherwise.
-bool SIInsertWaitcnts::isPreheaderToFlush(
- MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {
- auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
+// Return flags indicating which counters should be flushed in the preheader.
+PreheaderFlushFlags
+SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
+ const WaitcntBrackets &ScoreBrackets) {
+ auto [Iterator, IsInserted] =
+ PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags());
if (!IsInserted)
return Iterator->second;
MachineBasicBlock *Succ = MBB.getSingleSuccessor();
if (!Succ)
- return false;
+ return PreheaderFlushFlags();
MachineLoop *Loop = MLI->getLoopFor(Succ);
if (!Loop)
- return false;
+ return PreheaderFlushFlags();
- if (Loop->getLoopPreheader() == &MBB &&
- shouldFlushVmCnt(Loop, ScoreBrackets)) {
- Iterator->second = true;
- return true;
+ if (Loop->getLoopPreheader() == &MBB) {
+ Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
+ return Iterator->second;
}
- return false;
+ return PreheaderFlushFlags();
}
bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
@@ -2609,72 +3375,152 @@ bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
return SIInstrInfo::isVMEM(MI);
}
-// Return true if it is better to flush the vmcnt counter in the preheader of
-// the given loop. We currently decide to flush in two situations:
+bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
+ return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
+}
+
+// Check if instruction is a store to LDS that is counted via DSCNT
+// (where that counter exists).
+bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
+ if (!MI.mayStore())
+ return false;
+ if (SIInstrInfo::isDS(MI))
+ return true;
+ return false;
+}
+
+// Return flags indicating which counters should be flushed in the preheader of
+// the given loop. We currently decide to flush in a few situations:
+// For VMEM (FlushVmCnt):
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
// vgpr containing a value that is loaded outside of the loop. (Only on
// targets with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in the
// loop, and at least one use of a vgpr containing a value that is loaded
// outside of the loop.
-bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
- const WaitcntBrackets &Brackets) {
+// For DS (FlushDsCnt, GFX12+ only):
+// 3. The loop contains no DS reads, and at least one use of a vgpr containing
+// a value that is DS loaded outside of the loop.
+// 4. The loop contains DS read(s), loaded values are not used in the same
+// iteration but in the next iteration (prefetch pattern), and at least one
+// use of a vgpr containing a value that is DS loaded outside of the loop.
+// Flushing in the preheader reduces wait overhead if the wait requirement in
+// iteration 1 would otherwise be more strict.
+PreheaderFlushFlags
+SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
+ const WaitcntBrackets &Brackets) {
+ PreheaderFlushFlags Flags;
bool HasVMemLoad = false;
bool HasVMemStore = false;
- bool UsesVgprLoadedOutside = false;
- DenseSet<Register> VgprUse;
- DenseSet<Register> VgprDef;
+ bool SeenDSStoreInLoop = false;
+ bool UsesVgprLoadedOutsideVMEM = false;
+ bool UsesVgprLoadedOutsideDS = false;
+ bool VMemInvalidated = false;
+ // DS optimization only applies to GFX12+ where DS_CNT is separate.
+ bool DSInvalidated = !ST->hasExtendedWaitCounts();
+ DenseSet<MCRegUnit> VgprUse;
+ DenseSet<MCRegUnit> VgprDefVMEM;
+ DenseSet<MCRegUnit> VgprDefDS;
for (MachineBasicBlock *MBB : ML->blocks()) {
+ bool SeenDSStoreInCurrMBB = false;
for (MachineInstr &MI : *MBB) {
if (isVMEMOrFlatVMEM(MI)) {
HasVMemLoad |= MI.mayLoad();
HasVMemStore |= MI.mayStore();
}
-
+ if (mayStoreIncrementingDSCNT(MI))
+ SeenDSStoreInCurrMBB = true;
+ // Stores postdominated by a barrier will have a wait at the barrier
+ // and thus need not be waited on at the loop header. A barrier found
+ // later in the same MBB during the in-order traversal is used here as
+ // a cheaper alternative to a postdomination check.
+ if (MI.getOpcode() == AMDGPU::S_BARRIER)
+ SeenDSStoreInCurrMBB = false;
for (const MachineOperand &Op : MI.all_uses()) {
if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
continue;
- RegInterval Interval = Brackets.getRegInterval(&MI, Op);
// Vgpr use
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
// If we find a register that is loaded inside the loop, 1. and 2.
- // are invalidated and we can exit.
- if (VgprDef.contains(RegNo))
- return false;
- VgprUse.insert(RegNo);
- // If at least one of Op's registers is in the score brackets, the
- // value is likely loaded outside of the loop.
- if (Brackets.getRegScore(RegNo, LOAD_CNT) >
- Brackets.getScoreLB(LOAD_CNT) ||
- Brackets.getRegScore(RegNo, SAMPLE_CNT) >
- Brackets.getScoreLB(SAMPLE_CNT) ||
- Brackets.getRegScore(RegNo, BVH_CNT) >
- Brackets.getScoreLB(BVH_CNT)) {
- UsesVgprLoadedOutside = true;
- break;
- }
+ // are invalidated.
+ if (VgprDefVMEM.contains(RU))
+ VMemInvalidated = true;
+
+ // Check for DS loads used inside the loop
+ if (VgprDefDS.contains(RU))
+ DSInvalidated = true;
+
+ // Early exit if both optimizations are invalidated
+ if (VMemInvalidated && DSInvalidated)
+ return Flags;
+
+ VgprUse.insert(RU);
+ // Check if this register has a pending VMEM load from outside the
+ // loop (value loaded outside and used inside).
+ VMEMID ID = toVMEMID(RU);
+ if (Brackets.hasPendingVMEM(ID, LOAD_CNT) ||
+ Brackets.hasPendingVMEM(ID, SAMPLE_CNT) ||
+ Brackets.hasPendingVMEM(ID, BVH_CNT))
+ UsesVgprLoadedOutsideVMEM = true;
+ // Check if loaded outside the loop via DS (not VMEM/FLAT).
+ // Only consider it a DS load if there's no pending VMEM load for
+ // this register, since FLAT can set both counters.
+ else if (Brackets.hasPendingVMEM(ID, DS_CNT))
+ UsesVgprLoadedOutsideDS = true;
}
}
// VMem load vgpr def
if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
for (const MachineOperand &Op : MI.all_defs()) {
- RegInterval Interval = Brackets.getRegInterval(&MI, Op);
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
// If we find a register that is loaded inside the loop, 1. and 2.
- // are invalidated and we can exit.
- if (VgprUse.contains(RegNo))
- return false;
- VgprDef.insert(RegNo);
+ // are invalidated.
+ if (VgprUse.contains(RU))
+ VMemInvalidated = true;
+ VgprDefVMEM.insert(RU);
+ }
+ }
+ // Early exit if both optimizations are invalidated
+ if (VMemInvalidated && DSInvalidated)
+ return Flags;
+ }
+
+ // DS read vgpr def
+ // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RU).
+ // If USE comes before DEF, it's the prefetch pattern (use value from
+ // previous iteration, load for next iteration). We should still flush
+ // in preheader so iteration 1 doesn't need to wait inside the loop.
+ // Only invalidate when DEF comes before USE (same-iteration consumption,
+ // checked above when processing uses).
+ if (isDSRead(MI)) {
+ for (const MachineOperand &Op : MI.all_defs()) {
+ for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
+ VgprDefDS.insert(RU);
}
}
}
}
+ // Accumulate unprotected DS stores from this MBB
+ SeenDSStoreInLoop |= SeenDSStoreInCurrMBB;
}
- if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
- return true;
- return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
+
+ // VMEM flush decision
+ if (!VMemInvalidated && UsesVgprLoadedOutsideVMEM &&
+ ((!ST->hasVscnt() && HasVMemStore && !HasVMemLoad) ||
+ (HasVMemLoad && ST->hasVmemWriteVgprInOrder())))
+ Flags.FlushVmCnt = true;
+
+ // DS flush decision: flush if loop uses DS-loaded values from outside
+ // and either has no DS reads in the loop, or DS reads whose results
+ // are not used in the loop.
+ // DSInvalidated is pre-set to true on non-GFX12+ targets, where DS_CNT
+ // is LGKM_CNT, which also tracks FLAT/SMEM.
+ if (!DSInvalidated && !SeenDSStoreInLoop && UsesVgprLoadedOutsideDS)
+ Flags.FlushDsCnt = true;
+
+ return Flags;
}
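A reduced sketch of decisions 3. and 4. above, assuming hypothetical types and names (Inst, shouldFlushDs) and ignoring the DS-store/barrier handling: a use that hits an in-loop DS def invalidates the flush, while the prefetch pattern (use before def) keeps it profitable.

#include <cstdio>
#include <set>
#include <vector>

struct Inst {
  bool IsDSRead;
  std::vector<int> Uses, Defs; // virtual register ids, for illustration
};

static bool shouldFlushDs(const std::vector<Inst> &Loop,
                          const std::set<int> &PendingDSOutside) {
  std::set<int> DefDS;
  bool UsesOutside = false;
  for (const Inst &I : Loop) {
    for (int U : I.Uses) {
      if (DefDS.count(U))
        return false; // same-iteration consumption: flushing won't help
      if (PendingDSOutside.count(U))
        UsesOutside = true; // consumes a DS value loaded before the loop
    }
    if (I.IsDSRead)
      for (int D : I.Defs)
        DefDS.insert(D); // prefetch pattern: def after use is fine
  }
  return UsesOutside;
}

int main() {
  // v0 is DS-loaded before the loop; the loop uses v0, then prefetches it.
  std::vector<Inst> Loop{{false, {0}, {}}, {true, {}, {0}}};
  std::printf("%d\n", shouldFlushDs(Loop, {0})); // 1: flush in preheader
}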
bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
@@ -2714,48 +3560,36 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
+ // Initialize hardware limits first, as they're needed by the generators.
+ Limits = AMDGPU::HardwareLimits(IV);
+
if (ST->hasExtendedWaitCounts()) {
- MaxCounter = NUM_EXTENDED_INST_CNTS;
- WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
- WCG = &WCGGFX12Plus;
+ IsExpertMode = ST->hasExpertSchedulingMode() &&
+ (ExpertSchedulingModeFlag.getNumOccurrences()
+ ? ExpertSchedulingModeFlag
+ : MF.getFunction()
+ .getFnAttribute("amdgpu-expert-scheduling-mode")
+ .getValueAsBool());
+ MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
+ if (!WCG)
+ WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, &Limits,
+ IsExpertMode);
} else {
MaxCounter = NUM_NORMAL_INST_CNTS;
- WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
- WCG = &WCGPreGFX12;
+ if (!WCG)
+ WCG = std::make_unique<WaitcntGeneratorPreGFX12>(MF, NUM_NORMAL_INST_CNTS,
+ &Limits);
}
for (auto T : inst_counter_types())
ForceEmitWaitcnt[T] = false;
- WaitEventMaskForInst = WCG->getWaitEventMask();
-
- SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
-
- if (ST->hasExtendedWaitCounts()) {
- Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
- Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
- } else {
- Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
- Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
- }
- Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
- Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
- Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
- Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
- Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
- Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
-
- [[maybe_unused]] unsigned NumVGPRsMax =
- ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
- [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
- assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
- assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
+ SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS);
BlockInfos.clear();
bool Modified = false;
MachineBasicBlock &EntryBB = MF.front();
- MachineBasicBlock::iterator I = EntryBB.begin();
if (!MFI->isEntryFunction()) {
// Wait for any outstanding memory operations that the input registers may
@@ -2764,9 +3598,9 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
// TODO: Could insert earlier and schedule more liberally with operations
// that only use caller preserved registers.
- for (MachineBasicBlock::iterator E = EntryBB.end();
- I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
- ;
+ MachineBasicBlock::iterator I = EntryBB.begin();
+ while (I != EntryBB.end() && I->isMetaInstruction())
+ ++I;
if (ST->hasExtendedWaitCounts()) {
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
@@ -2783,6 +3617,12 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
TII->get(instrsForExtendedCounterTypes[CT]))
.addImm(0);
}
+ if (IsExpertMode) {
+ unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, *ST);
+ Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, 0);
+ BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(Enc);
+ }
} else {
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
}
@@ -2839,7 +3679,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
if (!SuccBI.Incoming) {
SuccBI.Dirty = true;
if (SuccBII <= BII) {
- LLVM_DEBUG(dbgs() << "repeat on backedge\n");
+ LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n");
Repeat = true;
}
if (!MoveBracketsToSucc) {
@@ -2847,11 +3687,20 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
} else {
SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
}
- } else if (SuccBI.Incoming->merge(*Brackets)) {
- SuccBI.Dirty = true;
- if (SuccBII <= BII) {
- LLVM_DEBUG(dbgs() << "repeat on backedge\n");
- Repeat = true;
+ } else {
+ LLVM_DEBUG({
+ dbgs() << "Try to merge ";
+ MBB->printName(dbgs());
+ dbgs() << " into ";
+ Succ->printName(dbgs());
+ dbgs() << '\n';
+ });
+ if (SuccBI.Incoming->merge(*Brackets)) {
+ SuccBI.Dirty = true;
+ if (SuccBII <= BII) {
+ LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n");
+ Repeat = true;
+ }
}
}
}
@@ -2907,26 +3756,49 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
}
}
+ if (IsExpertMode) {
+ // Enable expert scheduling on function entry. To satisfy ABI requirements
+ // and to allow calls between functions with different expert scheduling
+ // settings, disable it around calls and before returns.
+
+ MachineBasicBlock::iterator I = EntryBB.begin();
+ while (I != EntryBB.end() && I->isMetaInstruction())
+ ++I;
+ setSchedulingMode(EntryBB, I, true);
+
+ for (MachineInstr *MI : CallInsts) {
+ MachineBasicBlock &MBB = *MI->getParent();
+ setSchedulingMode(MBB, MI, false);
+ setSchedulingMode(MBB, std::next(MI->getIterator()), true);
+ }
+
+ for (MachineInstr *MI : ReturnInsts)
+ setSchedulingMode(*MI->getParent(), MI, false);
+
+ Modified = true;
+ }
+
// Deallocate the VGPRs before previously identified S_ENDPGM instructions.
// This is done in different ways depending on how the VGPRs were allocated
// (i.e. whether we're in dynamic VGPR mode or not).
// Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
// waveslot limited kernel runs slower with the deallocation.
- if (MFI->isDynamicVGPREnabled()) {
- for (MachineInstr *MI : ReleaseVGPRInsts) {
+ if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
+ for (auto [MI, _] : EndPgmInsts) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_ALLOC_VGPR))
.addImm(0);
Modified = true;
}
- } else {
- if (!ReleaseVGPRInsts.empty() &&
- (MF.getFrameInfo().hasCalls() ||
- ST->getOccupancyWithNumVGPRs(
- TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
- /*IsDynamicVGPR=*/false) <
- AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
- for (MachineInstr *MI : ReleaseVGPRInsts) {
+ } else if (!WCG->isOptNone() &&
+ ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
+ (MF.getFrameInfo().hasCalls() ||
+ ST->getOccupancyWithNumVGPRs(
+ TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
+ /*IsDynamicVGPR=*/false) <
+ AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+ for (auto [MI, Flag] : EndPgmInsts) {
+ if (Flag) {
if (ST->requiresNopBeforeDeallocVGPRs()) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_NOP))
@@ -2939,7 +3811,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
}
}
}
- ReleaseVGPRInsts.clear();
+
+ CallInsts.clear();
+ ReturnInsts.clear();
+ EndPgmInsts.clear();
PreheadersToFlush.clear();
SLoadAddresses.clear();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d516330..24aa31a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -63,7 +63,8 @@ static cl::opt<bool> Fix16BitCopies(
cl::ReallyHidden);
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
- : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+ : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
+ AMDGPU::ADJCALLSTACKDOWN),
RI(ST), ST(ST) {
SchedModel.init(&ST);
}
@@ -161,7 +162,7 @@ bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
if (!DstReg.isVirtual())
return true;
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
switch (Use.getOpcode()) {
case AMDGPU::S_AND_SAVEEXEC_B32:
@@ -179,6 +180,10 @@ bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
return false;
}
+ // If it is not convergent it does not depend on EXEC.
+ if (!MI.isConvergent())
+ return false;
+
switch (MI.getOpcode()) {
default:
break;
@@ -1154,7 +1159,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
- int NewOpc;
+ int64_t NewOpc;
// Try to map original to commuted opcode
NewOpc = AMDGPU::getCommuteRev(Opcode);
@@ -1325,7 +1330,8 @@ bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
case AMDGPU::S_MOV_B64_IMM_PSEUDO:
- case AMDGPU::V_MOV_B64_PSEUDO: {
+ case AMDGPU::V_MOV_B64_PSEUDO:
+ case AMDGPU::V_MOV_B16_t16_e32: {
const MachineOperand &Src0 = MI.getOperand(1);
if (Src0.isImm()) {
ImmVal = Src0.getImm();
@@ -1334,6 +1340,15 @@ bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
return false;
}
+ case AMDGPU::V_MOV_B16_t16_e64: {
+ const MachineOperand &Src0 = MI.getOperand(2);
+ if (Src0.isImm() && !MI.getOperand(1).getImm()) {
+ ImmVal = Src0.getImm();
+ return MI.getOperand(0).getReg() == Reg;
+ }
+
+ return false;
+ }
case AMDGPU::S_BREV_B32:
case AMDGPU::V_BFREV_B32_e32:
case AMDGPU::V_BFREV_B32_e64: {
@@ -1361,6 +1376,24 @@ bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
}
}
+std::optional<int64_t>
+SIInstrInfo::getImmOrMaterializedImm(MachineOperand &Op) const {
+ if (Op.isImm())
+ return Op.getImm();
+
+ if (!Op.isReg() || !Op.getReg().isVirtual())
+ return std::nullopt;
+ MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
+ const MachineInstr *Def = MRI.getVRegDef(Op.getReg());
+ if (Def && Def->isMoveImmediate()) {
+ const MachineOperand &ImmSrc = Def->getOperand(1);
+ if (ImmSrc.isImm())
+ return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
+ }
+
+ return std::nullopt;
+}
+
unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
if (RI.isAGPRClass(DstRC))
@@ -1393,6 +1426,10 @@ SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
if (VecSize <= 160) // 20 bytes
return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
+ if (VecSize <= 192) // 24 bytes
+ return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
+ if (VecSize <= 224) // 28 bytes
+ return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
if (VecSize <= 256) // 32 bytes
return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
if (VecSize <= 288) // 36 bytes
@@ -1421,6 +1458,10 @@ SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
if (VecSize <= 160) // 20 bytes
return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
+ if (VecSize <= 192) // 24 bytes
+ return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
+ if (VecSize <= 224) // 28 bytes
+ return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
if (VecSize <= 256) // 32 bytes
return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
if (VecSize <= 288) // 36 bytes
@@ -1450,6 +1491,10 @@ static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
if (VecSize <= 160) // 20 bytes
return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
+ if (VecSize <= 192) // 24 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
+ if (VecSize <= 224) // 28 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
if (VecSize <= 256) // 32 bytes
return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
if (VecSize <= 288) // 36 bytes
@@ -1479,6 +1524,10 @@ static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
if (VecSize <= 160) // 20 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
+ if (VecSize <= 192) // 24 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
+ if (VecSize <= 224) // 28 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
if (VecSize <= 256) // 32 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
if (VecSize <= 288) // 36 bytes
@@ -1667,8 +1716,7 @@ unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
void SIInstrInfo::storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
- bool isKill, int FrameIndex, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI, Register VReg,
+ bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
MachineInstr::MIFlag Flags) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
@@ -1680,7 +1728,7 @@ void SIInstrInfo::storeRegToStackSlot(
MachineMemOperand *MMO = MF->getMachineMemOperand(
PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
FrameInfo.getObjectAlign(FrameIndex));
- unsigned SpillSize = TRI->getSpillSize(*RC);
+ unsigned SpillSize = RI.getSpillSize(*RC);
MachineRegisterInfo &MRI = MF->getRegInfo();
if (RI.isSGPRClass(RC)) {
@@ -1862,14 +1910,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI,
- Register VReg,
+ Register VReg, unsigned SubReg,
MachineInstr::MIFlag Flags) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const DebugLoc &DL = MBB.findDebugLoc(MI);
- unsigned SpillSize = TRI->getSpillSize(*RC);
+ unsigned SpillSize = RI.getSpillSize(*RC);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
@@ -1955,17 +2002,15 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
constexpr unsigned ECQueueWaveAbort = 0x400;
MachineBasicBlock *TrapBB = &MBB;
- MachineBasicBlock *ContBB = &MBB;
MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
- ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
+ MBB.splitAt(MI, /*UpdateLiveIns=*/false);
TrapBB = MF->CreateMachineBasicBlock();
BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
MF->push_back(TrapBB);
MBB.addSuccessor(TrapBB);
}
-
// Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
// this will be a nop.
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
@@ -2001,7 +2046,7 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
MF->push_back(HaltLoopBB);
HaltLoopBB->addSuccessor(HaltLoopBB);
- return ContBB;
+ return MBB.getNextNode();
}
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
@@ -2132,11 +2177,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+ const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
+ const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
+
const MachineOperand &SrcOp = MI.getOperand(1);
// FIXME: Will this work for 64-bit floating point immediates?
assert(!SrcOp.isFPImm());
- if (ST.hasMovB64()) {
- MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
+ if (ST.hasMovB64() && Mov64RC->contains(Dst)) {
+ MI.setDesc(Mov64Desc);
if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
break;
@@ -2145,17 +2193,21 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
APInt Imm(64, SrcOp.getImm());
APInt Lo(32, Imm.getLoBits(32).getZExtValue());
APInt Hi(32, Imm.getHiBits(32).getZExtValue());
- if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
- BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
- .addImm(SISrcMods::OP_SEL_1)
- .addImm(Lo.getSExtValue())
- .addImm(SISrcMods::OP_SEL_1)
- .addImm(Lo.getSExtValue())
- .addImm(0) // op_sel_lo
- .addImm(0) // op_sel_hi
- .addImm(0) // neg_lo
- .addImm(0) // neg_hi
- .addImm(0); // clamp
+ const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
+ const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
+
+ if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
+ PkMovRC->contains(Dst)) {
+ BuildMI(MBB, MI, DL, PkMovDesc, Dst)
+ .addImm(SISrcMods::OP_SEL_1)
+ .addImm(Lo.getSExtValue())
+ .addImm(SISrcMods::OP_SEL_1)
+ .addImm(Lo.getSExtValue())
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0); // clamp
} else {
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
.addImm(Lo.getSExtValue())
@@ -2241,6 +2293,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
+ case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
+ case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
@@ -2253,6 +2307,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
@@ -2282,11 +2338,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
assert(VecReg == MI.getOperand(1).getReg());
MachineInstrBuilder MIB =
- BuildMI(MBB, MI, DL, OpDesc)
- .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
- .add(MI.getOperand(2))
- .addReg(VecReg, RegState::ImplicitDefine)
- .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
+ BuildMI(MBB, MI, DL, OpDesc)
+ .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
+ .add(MI.getOperand(2))
+ .addReg(VecReg, RegState::ImplicitDefine)
+ .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
const int ImpDefIdx =
OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
@@ -2300,6 +2356,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
+ case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
+ case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
@@ -2324,8 +2382,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
.add(MI.getOperand(2))
.addReg(VecReg, RegState::ImplicitDefine)
- .addReg(VecReg,
- RegState::Implicit | (IsUndef ? RegState::Undef : 0));
+ .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
const int ImpDefIdx =
OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
@@ -2344,6 +2401,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
+ case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
+ case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
@@ -2355,18 +2414,17 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
Register Dst = MI.getOperand(0).getReg();
Register VecReg = MI.getOperand(1).getReg();
bool IsUndef = MI.getOperand(1).isUndef();
- Register Idx = MI.getOperand(2).getReg();
Register SubReg = MI.getOperand(3).getImm();
MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
- .addReg(Idx)
+ .add(MI.getOperand(2))
.addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
SetOn->getOperand(3).setIsUndef();
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
.addDef(Dst)
.addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
- .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
+ .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
@@ -2500,7 +2558,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
break;
- case AMDGPU::V_MAX_BF16_PSEUDO_e64:
+ case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
assert(ST.hasBF16PackedInsts());
MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
@@ -2513,13 +2571,46 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
break;
}
+ case AMDGPU::GET_STACK_BASE:
+ // The stack starts at offset 0 unless we need to reserve some space at the
+ // bottom.
+ if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) {
+ // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
+ // some of the VGPRs. The size of the required scratch space has already
+ // been computed by prolog epilog insertion.
+ const SIMachineFunctionInfo *MFI =
+ MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+ unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
+ Register DestReg = MI.getOperand(0).getReg();
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg)
+ .addImm(AMDGPU::Hwreg::HwregEncoding::encode(
+ AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2));
+ // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
+ // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
+ // SCC, so we need to check for 0 manually.
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg);
+    // Change the implicit def of SCC into a use (but first remove the dead
+    // flag if present).
+ MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false);
+ MI.getOperand(MI.getNumExplicitOperands()).setIsUse();
+ MI.setDesc(get(AMDGPU::S_CMOVK_I32));
+ MI.addOperand(MachineOperand::CreateImm(VGPRSize));
+ } else {
+ MI.setDesc(get(AMDGPU::S_MOV_B32));
+ MI.addOperand(MachineOperand::CreateImm(0));
+ MI.removeOperand(
+ MI.getNumExplicitOperands()); // Drop implicit def of SCC.
+ }
+ break;
+ }
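As a standalone sketch (plain C++, names illustrative, not part of the patch) of the value this expansion materializes: the destination initially holds the MicroEngine ID read by S_GETREG_B32, so the graphics queue (ME_ID == 0) keeps 0 and compute queues get the reserved size.

#include <cstdint>

// Model of the expanded sequence: s_getreg_b32 leaves ME_ID in the dest,
// s_cmp_lg_u32 0, dest sets SCC, and s_cmovk_i32 overwrites the dest with
// the reserved size only when SCC is set (i.e. on a compute queue).
static uint32_t stackBase(uint32_t MicroEngineID, uint32_t ReservedForCWSR) {
  return MicroEngineID != 0 ? ReservedForCWSR : /*graphics queue*/ 0;
}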
+
return true;
}
void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, Register DestReg,
- unsigned SubIdx, const MachineInstr &Orig,
- const TargetRegisterInfo &RI) const {
+ unsigned SubIdx,
+ const MachineInstr &Orig) const {
// Try shrinking the instruction to remat only the part needed for current
// context.
@@ -2569,7 +2660,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
const MCInstrDesc &TID = get(NewOpcode);
const TargetRegisterClass *NewRC =
- RI.getAllocatableClass(getRegClass(TID, 0, &RI));
+ RI.getAllocatableClass(getRegClass(TID, 0));
MRI.setRegClass(DestReg, NewRC);
UseMO->setReg(DestReg);
@@ -2599,7 +2690,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
break;
}
- TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
+ TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig);
}
std::pair<MachineInstr*, MachineInstr*>
@@ -2644,7 +2735,7 @@ SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
if (Src.isPhysical())
MovDPP.addReg(RI.getSubReg(Src, Sub));
else
- MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
+ MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub);
}
}
@@ -2907,7 +2998,7 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
auto I = MBB.end();
auto &MCCtx = MF->getContext();
- if (ST.hasAddPC64Inst()) {
+ if (ST.useAddPC64Inst()) {
MCSymbol *Offset =
MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
@@ -2935,7 +3026,7 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
if (FlushSGPRWrites)
BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0, ST));
};
// We need to compute the offset relative to the instruction immediately after
@@ -2953,11 +3044,11 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
.addReg(PCReg, RegState::Define, AMDGPU::sub0)
- .addReg(PCReg, 0, AMDGPU::sub0)
+ .addReg(PCReg, {}, AMDGPU::sub0)
.addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
.addReg(PCReg, RegState::Define, AMDGPU::sub1)
- .addReg(PCReg, 0, AMDGPU::sub1)
+ .addReg(PCReg, {}, AMDGPU::sub1)
.addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
ApplyHazardWorkarounds();
@@ -3377,15 +3468,13 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
MachineInstr *Select;
if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
- Select =
- BuildMI(MBB, I, DL, get(SelOp), DstElt)
- .addReg(FalseReg, 0, SubIdx)
- .addReg(TrueReg, 0, SubIdx);
+ Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
+ .addReg(FalseReg, {}, SubIdx)
+ .addReg(TrueReg, {}, SubIdx);
} else {
- Select =
- BuildMI(MBB, I, DL, get(SelOp), DstElt)
- .addReg(TrueReg, 0, SubIdx)
- .addReg(FalseReg, 0, SubIdx);
+ Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
+ .addReg(TrueReg, {}, SubIdx)
+ .addReg(FalseReg, {}, SubIdx);
}
preserveCondRegFlags(Select->getOperand(3), Cond[1]);
@@ -3461,6 +3550,21 @@ void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
}
}
+void SIInstrInfo::mutateAndCleanupImplicit(MachineInstr &MI,
+ const MCInstrDesc &NewDesc) const {
+ MI.setDesc(NewDesc);
+
+  // Remove any leftover implicit operands from mutating the instruction,
+  // e.g. if we replace an s_and_b32 with a copy, we no longer need the
+  // implicit scc def.
+ const MCInstrDesc &Desc = MI.getDesc();
+ unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
+ Desc.implicit_defs().size();
+
+ for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
+ MI.removeOperand(I);
+}
+
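A minimal standalone model of the trimming loop above (plain C++; a vector stands in for the operand list):

#include <vector>

// Keep only the operands the new descriptor accounts for; anything past
// NumOps is a leftover implicit operand such as a stale SCC def.
static void trimToStaticOperands(std::vector<unsigned> &Operands,
                                 unsigned NumOps) {
  while (Operands.size() > NumOps)
    Operands.pop_back();
}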
std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
unsigned SubRegIndex) {
switch (SubRegIndex) {
@@ -3503,6 +3607,8 @@ static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
case AMDGPU::V_FMAC_F16_e64:
case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
+ case AMDGPU::V_FMAC_F16_t16_e32:
+ case AMDGPU::V_FMAC_F16_fake16_e32:
case AMDGPU::V_FMA_F16_e64:
return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
? AMDGPU::V_FMAAK_F16_t16
@@ -3535,6 +3641,8 @@ static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
case AMDGPU::V_FMAC_F16_e64:
case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
+ case AMDGPU::V_FMAC_F16_t16_e32:
+ case AMDGPU::V_FMAC_F16_fake16_e32:
case AMDGPU::V_FMA_F16_e64:
return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
? AMDGPU::V_FMAMK_F16_t16
@@ -3612,7 +3720,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
const MCInstrDesc &MovDesc = get(MovOp);
- const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI);
+ const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
if (Is16Bit) {
// We just need to find a correctly sized register class, so the
// subregister index compatibility doesn't matter since we're statically
@@ -3703,6 +3811,23 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
+ auto CopyRegOperandToNarrowerRC =
+ [MRI, this](MachineInstr &MI, unsigned OpNo,
+ const TargetRegisterClass *NewRC) -> void {
+ if (!MI.getOperand(OpNo).isReg())
+ return;
+ Register Reg = MI.getOperand(OpNo).getReg();
+ const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg);
+ if (RI.getCommonSubClass(RC, NewRC) != NewRC)
+ return;
+ Register Tmp = MRI->createVirtualRegister(NewRC);
+ BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(AMDGPU::COPY), Tmp)
+ .addReg(Reg);
+ MI.getOperand(OpNo).setReg(Tmp);
+ MI.getOperand(OpNo).setIsKill();
+ };
+
// Multiplied part is the constant: Use v_madmk_{f16, f32}.
if ((Src0->isReg() && Src0->getReg() == Reg) ||
(Src1->isReg() && Src1->getReg() == Reg)) {
@@ -3734,13 +3859,6 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
- // takes VGPR_32_Lo128 operands, so the rewrite would also require
- // restricting their register classes. For now just bail out.
- if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
- NewOpc == AMDGPU::V_FMAMK_F16_fake16)
- return false;
-
const std::optional<int64_t> SubRegImm = extractSubregFromImm(
Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
@@ -3765,6 +3883,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
removeModOperands(UseMI);
UseMI.setDesc(get(NewOpc));
+ if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
+ NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
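+      // The t16/fake16 variants take more restricted register classes
+      // (VGPR_16_Lo128 / VGPR_32_Lo128), so detour the dst through a
+      // temporary in the new dst class and narrow the source operands.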
+ const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
+ Register Tmp = MRI->createVirtualRegister(NewRC);
+ BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
+ UseMI.getDebugLoc(), get(AMDGPU::COPY),
+ UseMI.getOperand(0).getReg())
+ .addReg(Tmp, RegState::Kill);
+ UseMI.getOperand(0).setReg(Tmp);
+ CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
+ CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
+ }
+
bool DeleteDef = MRI->use_nodbg_empty(Reg);
if (DeleteDef)
DefMI.eraseFromParent();
@@ -3812,13 +3943,6 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
- // takes VGPR_32_Lo128 operands, so the rewrite would also require
- // restricting their register classes. For now just bail out.
- if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
- NewOpc == AMDGPU::V_FMAAK_F16_fake16)
- return false;
-
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
@@ -3838,6 +3962,20 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// These come before src2.
removeModOperands(UseMI);
UseMI.setDesc(get(NewOpc));
+
+ if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
+ NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
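+      // As for V_FMAMK above: detour the dst through a temporary in the
+      // more restricted dst class and narrow the source operands.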
+ const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
+ Register Tmp = MRI->createVirtualRegister(NewRC);
+ BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
+ UseMI.getDebugLoc(), get(AMDGPU::COPY),
+ UseMI.getOperand(0).getReg())
+ .addReg(Tmp, RegState::Kill);
+ UseMI.getOperand(0).setReg(Tmp);
+ CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
+ CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
+ }
+
// It might happen that UseMI was commuted
// and we now have SGPR as SRC1. If so 2 inlined
// constant and SGPR are illegal.
@@ -3917,6 +4055,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
if (isLDSDMA(MIa) || isLDSDMA(MIb))
return false;
+ if (MIa.isBundle() || MIb.isBundle())
+ return false;
+
// TODO: Should we check the address space from the MachineMemOperand? That
// would allow us to distinguish objects we know don't alias based on the
// underlying address space, even if it was lowered to a different one,
@@ -3982,7 +4123,7 @@ static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
MachineInstr **DefMI = nullptr) {
if (!MO->isReg())
return false;
- const MachineFunction *MF = MO->getParent()->getParent()->getParent();
+ const MachineFunction *MF = MO->getParent()->getMF();
const MachineRegisterInfo &MRI = MF->getRegInfo();
return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
}
@@ -4032,28 +4173,50 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
}
}
+/// Helper struct for the implementation of 3-address conversion to communicate
+/// updates made to instruction operands.
+struct SIInstrInfo::ThreeAddressUpdates {
+ /// Other instruction whose def is no longer used by the converted
+ /// instruction.
+ MachineInstr *RemoveMIUse = nullptr;
+};
+
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LiveVariables *LV,
LiveIntervals *LIS) const {
MachineBasicBlock &MBB = *MI.getParent();
- unsigned Opc = MI.getOpcode();
+ MachineInstr *CandidateMI = &MI;
- // Handle MFMA.
- int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
- if (NewMFMAOpc != -1) {
- MachineInstrBuilder MIB =
- BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
- MIB.add(MI.getOperand(I));
- updateLiveVariables(LV, MI, *MIB);
+ if (MI.isBundle()) {
+ // This is a temporary placeholder for bundle handling that enables us to
+ // exercise the relevant code paths in the two-address instruction pass.
+ if (MI.getBundleSize() != 1)
+ return nullptr;
+ CandidateMI = MI.getNextNode();
+ }
+
+ ThreeAddressUpdates U;
+ MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
+ if (!NewMI)
+ return nullptr;
+
+ if (MI.isBundle()) {
+ CandidateMI->eraseFromBundle();
+
+ for (MachineOperand &MO : MI.all_defs()) {
+ if (MO.isTied())
+ MI.untieRegOperand(MO.getOperandNo());
+ }
+ } else {
+ updateLiveVariables(LV, MI, *NewMI);
if (LIS) {
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
+ LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
// SlotIndex of defs needs to be updated when converting to early-clobber
- MachineOperand &Def = MIB->getOperand(0);
+ MachineOperand &Def = NewMI->getOperand(0);
if (Def.isEarlyClobber() && Def.isReg() &&
LIS->hasInterval(Def.getReg())) {
- SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
- SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
+ SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
+ SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
auto &LI = LIS->getInterval(Def.getReg());
auto UpdateDefIndex = [&](LiveRange &LR) {
auto *S = LR.find(OldIndex);
@@ -4068,6 +4231,88 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
UpdateDefIndex(SR);
}
}
+ }
+
+ if (U.RemoveMIUse) {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ // The only user is the instruction which will be killed.
+ Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
+
+ if (MRI.hasOneNonDBGUse(DefReg)) {
+ // We cannot just remove the DefMI here, calling pass will crash.
+ U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
+ U.RemoveMIUse->getOperand(0).setIsDead(true);
+ for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
+ U.RemoveMIUse->removeOperand(I);
+ if (LV)
+ LV->getVarInfo(DefReg).AliveBlocks.clear();
+ }
+
+ if (MI.isBundle()) {
+ VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
+ if (!VRI.Reads && !VRI.Writes) {
+ for (MachineOperand &MO : MI.all_uses()) {
+ if (MO.isReg() && MO.getReg() == DefReg) {
+ assert(MO.getSubReg() == 0 &&
+ "tied sub-registers in bundles currently not supported");
+ MI.removeOperand(MO.getOperandNo());
+ break;
+ }
+ }
+
+ if (LIS)
+ LIS->shrinkToUses(&LIS->getInterval(DefReg));
+ }
+ } else if (LIS) {
+ LiveInterval &DefLI = LIS->getInterval(DefReg);
+
+ // We cannot delete the original instruction here, so hack out the use
+ // in the original instruction with a dummy register so we can use
+ // shrinkToUses to deal with any multi-use edge cases. Other targets do
+ // not have the complexity of deleting a use to consider here.
+ Register DummyReg = MRI.cloneVirtualRegister(DefReg);
+ for (MachineOperand &MIOp : MI.uses()) {
+ if (MIOp.isReg() && MIOp.getReg() == DefReg) {
+ MIOp.setIsUndef(true);
+ MIOp.setReg(DummyReg);
+ }
+ }
+
+ LIS->shrinkToUses(&DefLI);
+ }
+ }
+
+ return MI.isBundle() ? &MI : NewMI;
+}
+
+MachineInstr *
+SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
+ ThreeAddressUpdates &U) const {
+ MachineBasicBlock &MBB = *MI.getParent();
+ unsigned Opc = MI.getOpcode();
+
+ // Handle MFMA.
+ int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
+ if (NewMFMAOpc != -1) {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
+ for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
+ MIB.add(MI.getOperand(I));
return MIB;
}
@@ -4075,13 +4320,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
.setMIFlags(MI.getFlags());
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+ for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
MIB->addOperand(MI.getOperand(I));
-
- updateLiveVariables(LV, MI, *MIB);
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
-
return MIB;
}
@@ -4152,39 +4392,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
(ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
!RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
MachineInstr *DefMI;
- const auto killDef = [&]() -> void {
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- // The only user is the instruction which will be killed.
- Register DefReg = DefMI->getOperand(0).getReg();
-
- if (MRI.hasOneNonDBGUse(DefReg)) {
- // We cannot just remove the DefMI here, calling pass will crash.
- DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
- DefMI->getOperand(0).setIsDead(true);
- for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
- DefMI->removeOperand(I);
- if (LV)
- LV->getVarInfo(DefReg).AliveBlocks.clear();
- }
-
- if (LIS) {
- LiveInterval &DefLI = LIS->getInterval(DefReg);
-
- // We cannot delete the original instruction here, so hack out the use
- // in the original instruction with a dummy register so we can use
- // shrinkToUses to deal with any multi-use edge cases. Other targets do
- // not have the complexity of deleting a use to consider here.
- Register DummyReg = MRI.cloneVirtualRegister(DefReg);
- for (MachineOperand &MIOp : MI.uses()) {
- if (MIOp.isReg() && MIOp.getReg() == DefReg) {
- MIOp.setIsUndef(true);
- MIOp.setReg(DummyReg);
- }
- }
-
- LIS->shrinkToUses(&DefLI);
- }
- };
int64_t Imm;
if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
@@ -4196,10 +4403,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.add(*Src1)
.addImm(Imm)
.setMIFlags(MI.getFlags());
- updateLiveVariables(LV, MI, *MIB);
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
- killDef();
+ U.RemoveMIUse = DefMI;
return MIB;
}
}
@@ -4212,11 +4416,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.addImm(Imm)
.add(*Src2)
.setMIFlags(MI.getFlags());
- updateLiveVariables(LV, MI, *MIB);
-
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
- killDef();
+ U.RemoveMIUse = DefMI;
return MIB;
}
}
@@ -4235,12 +4435,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.addImm(Imm)
.add(*Src2)
.setMIFlags(MI.getFlags());
- updateLiveVariables(LV, MI, *MIB);
-
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
- if (DefMI)
- killDef();
+ U.RemoveMIUse = DefMI;
return MIB;
}
}
@@ -4269,9 +4464,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.setMIFlags(MI.getFlags());
if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
MIB.addImm(OpSel ? OpSel->getImm() : 0);
- updateLiveVariables(LV, MI, *MIB);
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
return MIB;
}
@@ -4321,24 +4513,26 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
changesVGPRIndexingMode(MI);
}
-bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
+bool SIInstrInfo::isAlwaysGDS(uint32_t Opcode) const {
return Opcode == AMDGPU::DS_ORDERED_COUNT ||
Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}
-bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
- if (!isFLAT(MI) || isFLATGlobal(MI))
- return false;
-
- // If scratch is not initialized, we can never access it.
- if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
+bool SIInstrInfo::mayAccessScratch(const MachineInstr &MI) const {
+  // Instructions that access scratch use the FLAT or BUF encodings.
+ if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
return false;
// SCRATCH instructions always access scratch.
if (isFLATScratch(MI))
return true;
+ // If FLAT_SCRATCH registers are not initialized, we can never access scratch
+ // via the aperture.
+ if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
+ return false;
+
// If there are no memory operands then conservatively assume the flat
// operation may access scratch.
if (MI.memoperands_empty())
@@ -4569,6 +4763,8 @@ bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
return AMDGPU::isInlinableLiteralV2F16(Imm);
+ case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
+ return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus());
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
return AMDGPU::isInlinableLiteralV2BF16(Imm);
@@ -4945,8 +5141,8 @@ bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const {
- uint16_t Opcode = MI.getOpcode();
- const MachineFunction *MF = MI.getParent()->getParent();
+ uint32_t Opcode = MI.getOpcode();
+ const MachineFunction *MF = MI.getMF();
const MachineRegisterInfo &MRI = MF->getRegInfo();
// FIXME: At this point the COPY verify is done only for non-ssa forms.
@@ -5036,6 +5232,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
case AMDGPU::OPERAND_REG_IMM_V2BF16:
@@ -5104,7 +5301,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
// aligned register constraint.
// FIXME: We do not verify inline asm operands, but custom inline asm
// verification is broken anyway
- if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
+ if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
+ Opcode != AMDGPU::V_MOV_B64_PSEUDO) {
const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
if (const TargetRegisterClass *SubRC =
@@ -5200,7 +5398,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
- uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
+ uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
if (isVOPC(BasicOpcode)) {
if (!ST.hasSDWASdst() && DstIdx != -1) {
// Only vcc allowed as dst on VI for VOPC
@@ -5450,9 +5648,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
Desc.getNumOperands() + Desc.implicit_uses().size();
const unsigned NumImplicitOps = IsDst ? 2 : 1;
- // Allow additional implicit operands. This allows a fixup done by the post
- // RA scheduler where the main implicit operand is killed and implicit-defs
- // are added for sub-registers that remain live after this instruction.
+ // Require additional implicit operands. This allows a fixup done by the
+ // post RA scheduler where the main implicit operand is killed and
+ // implicit-defs are added for sub-registers that remain live after this
+ // instruction.
if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
ErrInfo = "missing implicit register operands";
return false;
@@ -5734,6 +5933,17 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
+ MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
+ const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
+ if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
+ &AMDGPU::SReg_64RegClass) ||
+ Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
+ ErrInfo = "Instruction cannot read flat_scratch_base_hi";
+ return false;
+ }
+ }
+
return true;
}
@@ -5752,17 +5962,17 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
case AMDGPU::S_MOV_B32: {
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
return MI.getOperand(1).isReg() ||
RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
}
case AMDGPU::S_ADD_I32:
- return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
+ return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
case AMDGPU::S_ADDC_U32:
return AMDGPU::V_ADDC_U32_e32;
case AMDGPU::S_SUB_I32:
- return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
+ return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
// FIXME: These are not consistently handled, and selected when the carry is
// used.
case AMDGPU::S_ADD_U32:
@@ -6019,19 +6229,6 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
}
-// FIXME: This should not be an overridable function. All subtarget dependent
-// operand modifications should go through isLookupRegClassByHwMode in the
-// generic handling.
-const TargetRegisterClass *
-SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum,
- const TargetRegisterInfo *TRI) const {
- if (OpNum >= TID.getNumOperands())
- return nullptr;
- const MCOperandInfo &OpInfo = TID.operands()[OpNum];
- int16_t RegClass = getOpRegClassID(OpInfo);
- return RI.getRegClass(RegClass);
-}
-
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
unsigned OpNo) const {
const MCInstrDesc &Desc = get(MI.getOpcode());
@@ -6040,14 +6237,14 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
Register Reg = MI.getOperand(OpNo).getReg();
if (Reg.isVirtual()) {
- const MachineRegisterInfo &MRI =
- MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
return MRI.getRegClass(Reg);
}
return RI.getPhysRegBaseClass(Reg);
}
- return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo]));
+ int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
+ return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
}
void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
@@ -6086,7 +6283,7 @@ unsigned SIInstrInfo::buildExtractSubReg(
unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
- .addReg(SuperReg.getReg(), 0, NewSubIdx);
+ .addReg(SuperReg.getReg(), {}, NewSubIdx);
return SubReg;
}
@@ -6131,7 +6328,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
if (MO.getSubReg()) {
- const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+ const MachineFunction *MF = MO.getParent()->getMF();
const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
if (!SuperRC)
return false;
@@ -6143,7 +6340,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand &MO) const {
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
unsigned Opc = MI.getOpcode();
@@ -6151,7 +6348,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
// information.
if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
- constexpr const AMDGPU::OpName OpNames[] = {
+ constexpr AMDGPU::OpName OpNames[] = {
AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
for (auto [I, OpName] : enumerate(OpNames)) {
@@ -6196,6 +6393,18 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
(int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
RI.isSGPRReg(MRI, MO.getReg()))
return false;
+
+ if (ST.hasFlatScratchHiInB64InstHazard() &&
+ MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
+ if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
+ if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
+ 64)
+ return false;
+ }
+ if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
+ return false;
+ }
+
return true;
}
@@ -6213,8 +6422,8 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
const MachineOperand *MO) const {
- constexpr const unsigned NumOps = 3;
- constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
+ constexpr unsigned NumOps = 3;
+ constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
AMDGPU::OpName::src0, AMDGPU::OpName::src1,
AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
@@ -6245,7 +6454,7 @@ bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
@@ -6670,7 +6879,7 @@ Register SIInstrInfo::readlaneVGPRToSGPR(
Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
- .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
+ .addReg(SrcReg, {}, RI.getSubRegFromChannel(i));
SRegs.push_back(SGPR);
}
@@ -6799,7 +7008,7 @@ void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
return;
const TargetRegisterClass *DeclaredRC =
- getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI);
+ getRegClass(MI.getDesc(), SAddr->getOperandNo());
Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
SAddr->setReg(ToSGPR);
@@ -6898,7 +7107,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
ScalarOp->setIsKill();
} else {
SmallVector<Register, 8> ReadlanePieces;
- unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
+ RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
"Unhandled register size");
@@ -7141,7 +7350,7 @@ extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
MachineBasicBlock *
SIInstrInfo::legalizeOperands(MachineInstr &MI,
MachineDominatorTree *MDT) const {
- MachineFunction &MF = *MI.getParent()->getParent();
+ MachineFunction &MF = *MI.getMF();
MachineRegisterInfo &MRI = MF.getRegInfo();
MachineBasicBlock *CreatedBB = nullptr;
@@ -7169,44 +7378,12 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
return CreatedBB;
}
- // Legalize REG_SEQUENCE and PHI
- // The register class of the operands much be the same type as the register
+ // Legalize PHI
+ // The register class of the operands must be the same type as the register
// class of the output.
if (MI.getOpcode() == AMDGPU::PHI) {
- const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
- for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
- if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
- continue;
- const TargetRegisterClass *OpRC =
- MRI.getRegClass(MI.getOperand(i).getReg());
- if (RI.hasVectorRegisters(OpRC)) {
- VRC = OpRC;
- } else {
- SRC = OpRC;
- }
- }
-
- // If any of the operands are VGPR registers, then they all most be
- // otherwise we will create illegal VGPR->SGPR copies when legalizing
- // them.
- if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
- if (!VRC) {
- assert(SRC);
- if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
- VRC = &AMDGPU::VReg_1RegClass;
- } else
- VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
- ? RI.getEquivalentAGPRClass(SRC)
- : RI.getEquivalentVGPRClass(SRC);
- } else {
- VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
- ? RI.getEquivalentAGPRClass(VRC)
- : RI.getEquivalentVGPRClass(VRC);
- }
- RC = VRC;
- } else {
- RC = SRC;
- }
+ const TargetRegisterClass *VRC = getOpRegClass(MI, 0);
+ assert(!RI.isSGPRClass(VRC));
// Update all the operands so they have the same type.
for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
@@ -7220,7 +7397,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
// Avoid creating no-op copies with the same src and dst reg class. These
// confuse some of the machine passes.
- legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
+ legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc());
}
}
@@ -7426,18 +7603,18 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
// NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
- .addDef(CondReg0)
- .addReg(RsrcPtr, 0, AMDGPU::sub0)
- .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
- .addImm(0);
+ .addDef(CondReg0)
+ .addReg(RsrcPtr, {}, AMDGPU::sub0)
+ .addReg(VAddr->getReg(), {}, AMDGPU::sub0)
+ .addImm(0);
// NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
- .addDef(CondReg1, RegState::Dead)
- .addReg(RsrcPtr, 0, AMDGPU::sub1)
- .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
- .addReg(CondReg0, RegState::Kill)
- .addImm(0);
+ .addDef(CondReg1, RegState::Dead)
+ .addReg(RsrcPtr, {}, AMDGPU::sub1)
+ .addReg(VAddr->getReg(), {}, AMDGPU::sub1)
+ .addReg(CondReg0, RegState::Kill)
+ .addImm(0);
// NewVaddr = {NewVaddrHi, NewVaddrLo}
BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
@@ -7510,9 +7687,9 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
// NewVaddr = {NewVaddrHi, NewVaddrLo}
BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
NewVAddr)
- .addReg(RsrcPtr, 0, AMDGPU::sub0)
+ .addReg(RsrcPtr, {}, AMDGPU::sub0)
.addImm(AMDGPU::sub0)
- .addReg(RsrcPtr, 0, AMDGPU::sub1)
+ .addReg(RsrcPtr, {}, AMDGPU::sub1)
.addImm(AMDGPU::sub1);
} else {
// Legalize a VGPR Rsrc and soffset together.
@@ -7630,6 +7807,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
unsigned Opcode = Inst.getOpcode();
unsigned NewOpcode = getVALUOp(Inst);
+ const DebugLoc &DL = Inst.getDebugLoc();
+
// Handle some special cases
switch (Opcode) {
default:
@@ -7781,6 +7960,11 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.eraseFromParent();
return;
+ case AMDGPU::S_ABSDIFF_I32:
+ lowerScalarAbsDiff(Worklist, Inst);
+ Inst.eraseFromParent();
+ return;
+
case AMDGPU::S_CBRANCH_SCC0:
case AMDGPU::S_CBRANCH_SCC1: {
// Clear unused bits of vcc
@@ -7867,7 +8051,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
- const DebugLoc &DL = Inst.getDebugLoc();
MachineOperand &Dest0 = Inst.getOperand(0);
MachineOperand &Dest1 = Inst.getOperand(1);
MachineOperand &Src0 = Inst.getOperand(2);
@@ -7887,12 +8070,37 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
legalizeOperands(*NewInstr, MDT);
MRI.replaceRegWith(Dest0.getReg(), DestReg);
- addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
- Worklist);
+ addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
Inst.eraseFromParent();
}
return;
+ case AMDGPU::S_LSHL1_ADD_U32:
+ case AMDGPU::S_LSHL2_ADD_U32:
+ case AMDGPU::S_LSHL3_ADD_U32:
+ case AMDGPU::S_LSHL4_ADD_U32: {
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
+ : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
+ : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
+ : 4);
+
+ const TargetRegisterClass *NewRC =
+ RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
+ Register DestReg = MRI.createVirtualRegister(NewRC);
+ MachineInstr *NewInstr =
+ BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
+ .add(Src0)
+ .addImm(ShiftAmt)
+ .add(Src1);
+ legalizeOperands(*NewInstr, MDT);
+ MRI.replaceRegWith(Dest.getReg(), DestReg);
+ addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
+ Inst.eraseFromParent();
+ }
+ return;
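As a sanity check, a standalone sketch of the scalar op's semantics, which V_LSHL_ADD_U32_e64 implements directly:

#include <cstdint>

// s_lshlN_add_u32 d, s0, s1 computes d = (s0 << N) + s1 for N in 1..4.
static uint32_t lshlAddU32(uint32_t Src0, unsigned ShiftAmt, uint32_t Src1) {
  return (Src0 << ShiftAmt) + Src1;
}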
case AMDGPU::S_CSELECT_B32:
case AMDGPU::S_CSELECT_B64:
lowerSelect(Worklist, Inst, MDT);
@@ -7943,7 +8151,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
legalizeOperands(*NewInstr, MDT);
int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
- MachineOperand SCCOp = Inst.getOperand(SCCIdx);
+ const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
Inst.eraseFromParent();
return;
@@ -7983,13 +8191,12 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
legalizeOperandsVALUt16(*NewInstr, MRI);
legalizeOperands(*NewInstr, MDT);
int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
- MachineOperand SCCOp = Inst.getOperand(SCCIdx);
+ const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
Inst.eraseFromParent();
return;
}
case AMDGPU::S_CVT_HI_F32_F16: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
if (ST.useRealTrue16Insts()) {
@@ -7997,7 +8204,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
.add(Inst.getOperand(1));
BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
.addImm(0) // src0_modifiers
- .addReg(TmpReg, 0, AMDGPU::hi16)
+ .addReg(TmpReg, {}, AMDGPU::hi16)
.addImm(0) // clamp
.addImm(0) // omod
.addImm(0); // op_sel0
@@ -8019,7 +8226,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MINIMUM_F32:
case AMDGPU::S_MAXIMUM_F32: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
.addImm(0) // src0_modifiers
@@ -8037,7 +8243,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MINIMUM_F16:
case AMDGPU::S_MAXIMUM_F16: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
? &AMDGPU::VGPR_16RegClass
: &AMDGPU::VGPR_32RegClass);
@@ -8061,7 +8266,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
case AMDGPU::V_S_RCP_F16_e64:
case AMDGPU::V_S_RSQ_F16_e64:
case AMDGPU::V_S_SQRT_F16_e64: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
? &AMDGPU::VGPR_16RegClass
: &AMDGPU::VGPR_32RegClass);
@@ -8110,26 +8314,34 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
- if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
- NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
- // Instead of creating a copy where src and dst are the same register
- // class, we just replace all uses of dst with src. These kinds of
- // copies interfere with the heuristics MachineSink uses to decide
- // whether or not to split a critical edge. Since the pass assumes
- // that copies will end up as machine instructions and not be
- // eliminated.
- addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+ if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
Register NewDstReg = Inst.getOperand(1).getReg();
- MRI.replaceRegWith(DstReg, NewDstReg);
- MRI.clearKillFlags(NewDstReg);
- Inst.getOperand(0).setReg(DstReg);
- Inst.eraseFromParent();
- // Legalize t16 operand since replaceReg is called after addUsersToVALU
- for (MachineOperand &MO :
- make_early_inc_range(MRI.use_operands(NewDstReg))) {
- legalizeOperandsVALUt16(*MO.getParent(), MRI);
+ const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
+ if (const TargetRegisterClass *CommonRC =
+ RI.getCommonSubClass(NewDstRC, SrcRC)) {
+ // Instead of creating a copy where src and dst are the same register
+ // class, we just replace all uses of dst with src. These kinds of
+ // copies interfere with the heuristics MachineSink uses to decide
+ // whether or not to split a critical edge. Since the pass assumes
+ // that copies will end up as machine instructions and not be
+ // eliminated.
+ addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ MRI.clearKillFlags(NewDstReg);
+ Inst.getOperand(0).setReg(DstReg);
+
+ if (!MRI.constrainRegClass(NewDstReg, CommonRC))
+ llvm_unreachable("failed to constrain register");
+
+ Inst.eraseFromParent();
+ // Legalize t16 operand since replaceReg is called after addUsersToVALU
+ for (MachineOperand &MO :
+ make_early_inc_range(MRI.use_operands(NewDstReg))) {
+ legalizeOperandsVALUt16(*MO.getParent(), MRI);
+ }
+
+ return;
}
- return;
}
// If this is a v2s copy between 16bit and 32bit reg,
@@ -8181,7 +8393,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
AMDGPU::OpName::src0_modifiers) >= 0)
NewInstr.addImm(0);
if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
- MachineOperand Src = Inst.getOperand(1);
+ const MachineOperand &Src = Inst.getOperand(1);
NewInstr->addOperand(Src);
}
@@ -8268,7 +8480,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
std::pair<bool, MachineBasicBlock *>
SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT) const {
- if (ST.hasAddNoCarry()) {
+ if (ST.hasAddNoCarryInsts()) {
// Assume there is no user of scc since we don't select this in that case.
// Since scc isn't used, it doesn't really matter if the i32 or u32 variant
// is used.
@@ -8307,7 +8519,7 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineBasicBlock::iterator MII = Inst;
- DebugLoc DL = Inst.getDebugLoc();
+ const DebugLoc &DL = Inst.getDebugLoc();
MachineOperand &Dest = Inst.getOperand(0);
MachineOperand &Src0 = Inst.getOperand(1);
@@ -8388,15 +8600,15 @@ void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineBasicBlock::iterator MII = Inst;
- DebugLoc DL = Inst.getDebugLoc();
+ const DebugLoc &DL = Inst.getDebugLoc();
MachineOperand &Dest = Inst.getOperand(0);
MachineOperand &Src = Inst.getOperand(1);
Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned SubOp = ST.hasAddNoCarry() ?
- AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
+ unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
+ : AMDGPU::V_SUB_CO_U32_e32;
BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
.addImm(0)
@@ -8410,6 +8622,37 @@ void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
+void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
+ MachineInstr &Inst) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ const DebugLoc &DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src1 = Inst.getOperand(1);
+ MachineOperand &Src2 = Inst.getOperand(2);
+ Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
+ : AMDGPU::V_SUB_CO_U32_e32;
+
+ BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
+ .addReg(Src1.getReg())
+ .addReg(Src2.getReg());
+
+ BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
+
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
+ .addReg(SubResultReg)
+ .addReg(TmpReg);
+
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
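A standalone model (plain C++, with the 32-bit wrap-around of the VALU ops) of the three-instruction expansion above:

#include <algorithm>
#include <cstdint>

static uint32_t absDiff32(uint32_t Src1, uint32_t Src2) {
  uint32_t Sub = Src1 - Src2;                 // V_SUB_U32 -> SubResultReg
  uint32_t Neg = 0u - Sub;                    // V_SUB_U32 -> TmpReg
  // V_MAX_I32 picks the non-negative of x and -x, i.e. |Src1 - Src2|.
  return static_cast<uint32_t>(
      std::max(static_cast<int32_t>(Sub), static_cast<int32_t>(Neg)));
}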
void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -8541,7 +8784,7 @@ void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
MachineOperand &Dest = Inst.getOperand(0);
MachineOperand &Src0 = Inst.getOperand(1);
- DebugLoc DL = Inst.getDebugLoc();
+ const DebugLoc &DL = Inst.getDebugLoc();
MachineBasicBlock::iterator MII = Inst;
@@ -8775,7 +9018,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
MachineOperand &Dest = Inst.getOperand(0);
MachineOperand &Src0 = Inst.getOperand(1);
MachineOperand &Src1 = Inst.getOperand(2);
- DebugLoc DL = Inst.getDebugLoc();
+ const DebugLoc &DL = Inst.getDebugLoc();
MachineBasicBlock::iterator MII = Inst;
@@ -8937,7 +9180,7 @@ void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
- .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
+ .addReg(Inst.getOperand(1).getReg(), {}, AMDGPU::sub0)
.addImm(0)
.addImm(BitWidth);
@@ -8961,14 +9204,14 @@ void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
- .addImm(31)
- .addReg(Src.getReg(), 0, AMDGPU::sub0);
+ .addImm(31)
+ .addReg(Src.getReg(), {}, AMDGPU::sub0);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
- .addReg(Src.getReg(), 0, AMDGPU::sub0)
- .addImm(AMDGPU::sub0)
- .addReg(TmpReg)
- .addImm(AMDGPU::sub1);
+ .addReg(Src.getReg(), {}, AMDGPU::sub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(TmpReg)
+ .addImm(AMDGPU::sub1);
MRI.replaceRegWith(Dest.getReg(), ResultReg);
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
@@ -8993,8 +9236,8 @@ void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
const MCInstrDesc &InstDesc = get(Opcode);
bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
- unsigned OpcodeAdd =
- ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
+ unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
+ : AMDGPU::V_ADD_CO_U32_e32;
const TargetRegisterClass *SrcRC =
Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
@@ -9072,6 +9315,67 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
MachineOperand &Src1 = Inst.getOperand(2);
const DebugLoc &DL = Inst.getDebugLoc();
+ if (ST.useRealTrue16Insts()) {
+ Register SrcReg0, SrcReg1;
+ if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
+ SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
+ } else {
+ SrcReg0 = Src0.getReg();
+ }
+
+ if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
+ SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
+ } else {
+ SrcReg1 = Src1.getReg();
+ }
+
+ bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
+ bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
+
+ auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
+ switch (Inst.getOpcode()) {
+ case AMDGPU::S_PACK_LL_B32_B16:
+ NewMI
+ .addReg(SrcReg0, {},
+ isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, {},
+ isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::hi16);
+ break;
+ case AMDGPU::S_PACK_LH_B32_B16:
+ NewMI
+ .addReg(SrcReg0, {},
+ isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, {}, AMDGPU::hi16)
+ .addImm(AMDGPU::hi16);
+ break;
+ case AMDGPU::S_PACK_HL_B32_B16:
+ NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, {},
+ isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::hi16);
+ break;
+ case AMDGPU::S_PACK_HH_B32_B16:
+ NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, {}, AMDGPU::hi16)
+ .addImm(AMDGPU::hi16);
+ break;
+ default:
+ llvm_unreachable("unhandled s_pack_* instruction");
+ }
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+ return;
+ }
+
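For reference, a standalone model of what the S_PACK_LL REG_SEQUENCE above assembles; the LH/HL/HH variants select hi16 halves analogously:

#include <cstdint>

static uint32_t packLL(uint32_t Src0, uint32_t Src1) {
  return (Src0 & 0xffffu) | (Src1 << 16); // result.lo16 = src0.lo16,
                                          // result.hi16 = src1.lo16
}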
switch (Inst.getOpcode()) {
case AMDGPU::S_PACK_LL_B32_B16: {
Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -9136,7 +9440,7 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
+void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
MachineInstr &SCCDefInst,
SIInstrWorklist &Worklist,
Register NewCond) const {
@@ -9154,7 +9458,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
if (SCCIdx != -1) {
if (MI.isCopy()) {
- MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
Register DestReg = MI.getOperand(0).getReg();
MRI.replaceRegWith(DestReg, NewCond);
@@ -9266,7 +9570,7 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
return SGPRReg;
Register UsedSGPRs[3] = {Register()};
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
for (unsigned i = 0; i < 3; ++i) {
int Idx = OpIndices[i];
@@ -9490,7 +9794,14 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
LiteralSize = 8;
break;
case AMDGPU::OPERAND_REG_IMM_INT64:
- if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
+      // A 32-bit literal is only valid here when the value is representable
+      // as both a signed and an unsigned 32-bit integer, i.e. lies in
+      // [0, 2^31 - 1], matching the MC code emitter's getLit64Encoding
+      // logic. The encoding cannot convey the signedness of the literal, so
+      // we must be conservative and assume any value outside this range
+      // requires a 64-bit literal encoding (8 bytes).
+ if (!Op.isImm() || !isInt<32>(Op.getImm()) ||
+ !isUInt<32>(Op.getImm()))
LiteralSize = 8;
break;
}
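A standalone sketch of the range check (the constants follow from the comment above):

#include <cstdint>

// True iff a 64-bit immediate may use the short 32-bit literal encoding:
// representable as both int32 and uint32, i.e. in [0, 2^31 - 1].
static bool fitsIn32BitLiteral(int64_t Imm) {
  return Imm >= 0 && Imm <= INT32_MAX;
}
// fitsIn32BitLiteral(42)         -> true  (4-byte literal)
// fitsIn32BitLiteral(-1)         -> false (8-byte encoding required)
// fitsIn32BitLiteral(0x80000000) -> false (unsigned-only value)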
@@ -9516,7 +9827,7 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return getInstBundleSize(MI);
case TargetOpcode::INLINEASM:
case TargetOpcode::INLINEASM_BR: {
- const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineFunction *MF = MI.getMF();
const char *AsmStr = MI.getOperand(0).getSymbolName();
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
}
@@ -9628,6 +9939,7 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
{MONoClobber, "amdgpu-noclobber"},
{MOLastUse, "amdgpu-last-use"},
{MOCooperative, "amdgpu-cooperative"},
+ {MOThreadPrivate, "amdgpu-thread-private"},
};
return ArrayRef(TargetFlags);
@@ -9643,6 +9955,30 @@ unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
return AMDGPU::COPY;
}
+bool SIInstrInfo::canAddToBBProlog(const MachineInstr &MI) const {
+ uint32_t Opcode = MI.getOpcode();
+  // Check whether this is an SGPR spill or a WWM-register spill opcode.
+ if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
+ return true;
+
+ const MachineFunction *MF = MI.getMF();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+
+ // See if this is Liverange split instruction inserted for SGPR or
+  // See if this is a live-range split instruction inserted for an SGPR or a
+  // WWM register. The implicit defs inserted for WWM registers should also
+  // be included, as they can appear at the beginning of a BB.
+ if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
+ return false;
+
+ Register Reg = MI.getOperand(0).getReg();
+ if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
+ return IsLRSplitInst;
+
+ return MFI->isWWMReg(Reg);
+}
+
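A decision sketch of the predicate above (plain C++; the booleans stand in for the MI queries):

static bool canAddToBBPrologModel(bool IsSpill, bool IsLRSplit,
                                  bool IsImplicitDef, bool DefIsSGPR,
                                  bool DefIsWWMReg) {
  if (IsSpill)                       // SGPR or WWM-register spill opcodes
    return true;
  if (!IsLRSplit && !IsImplicitDef)  // only LR splits / implicit defs remain
    return false;
  return DefIsSGPR ? IsLRSplit : DefIsWWMReg;
}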
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
Register Reg) const {
// We need to handle instructions which may be inserted during register
@@ -9651,20 +9987,16 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
// needed by the prolog. However, the insertions for scalar registers can
// always be placed at the BB top as they are independent of the exec mask
// value.
- const MachineFunction *MF = MI.getParent()->getParent();
bool IsNullOrVectorRegister = true;
if (Reg) {
+ const MachineFunction *MF = MI.getMF();
const MachineRegisterInfo &MRI = MF->getRegInfo();
IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
}
- uint16_t Opcode = MI.getOpcode();
- const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
return IsNullOrVectorRegister &&
- (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
- (Opcode == AMDGPU::IMPLICIT_DEF &&
- MFI->isWWMReg(MI.getOperand(0).getReg())) ||
- (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
+ (canAddToBBProlog(MI) ||
+ (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
MI.modifiesRegister(AMDGPU::EXEC, &RI)));
}
@@ -9673,7 +10005,7 @@ SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
Register DestReg) const {
- if (ST.hasAddNoCarry())
+ if (ST.hasAddNoCarryInsts())
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -9689,7 +10021,7 @@ MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
const DebugLoc &DL,
Register DestReg,
RegScavenger &RS) const {
- if (ST.hasAddNoCarry())
+ if (ST.hasAddNoCarryInsts())
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
// If available, prefer to use vcc.
@@ -9746,6 +10078,9 @@ void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
if (MI.isInlineAsm())
return;
+ if (MI.getNumOperands() < MI.getNumExplicitOperands())
+ return;
+
for (auto &Op : MI.implicit_operands()) {
if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
Op.setReg(AMDGPU::VCC_LO);
@@ -9928,6 +10263,8 @@ static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
case AMDGPUSubtarget::GFX12:
return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
: SIEncodingFamily::GFX12;
+ case AMDGPUSubtarget::GFX13:
+ return SIEncodingFamily::GFX13;
}
llvm_unreachable("Unknown subtarget generation!");
}
@@ -9986,7 +10323,8 @@ static bool isRenamedInGFX9(int Opcode) {
}
int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
- Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
+ assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
+ "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
unsigned Gen = subtargetEncodingFamily(ST);
@@ -10019,9 +10357,9 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
Opcode = MFMAOp;
}
- int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
+ int64_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
- if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
+ if (MCOp == (uint32_t)-1 && ST.hasGFX1250Insts())
MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12);
// -1 means that Opcode is already a native instruction.
@@ -10029,20 +10367,20 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
return Opcode;
if (ST.hasGFX90AInsts()) {
- uint16_t NMCOp = (uint16_t)-1;
+ uint32_t NMCOp = (uint32_t)-1;
if (ST.hasGFX940Insts())
NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
- if (NMCOp == (uint16_t)-1)
+ if (NMCOp == (uint32_t)-1)
NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
- if (NMCOp == (uint16_t)-1)
+ if (NMCOp == (uint32_t)-1)
NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
- if (NMCOp != (uint16_t)-1)
+ if (NMCOp != (uint32_t)-1)
MCOp = NMCOp;
}
- // (uint16_t)-1 means that Opcode is a pseudo instruction that has
+ // (uint32_t)-1 means that Opcode is a pseudo instruction that has
// no encoding in the given subtarget generation.
- if (MCOp == (uint16_t)-1)
+ if (MCOp == (uint32_t)-1)
return -1;
if (isAsmOnlyOpcode(MCOp))
@@ -10097,7 +10435,7 @@ static bool followSubRegDef(MachineInstr &MI,
}
MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
- MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI) {
assert(MRI.isSSA());
if (!P.Reg.isVirtual())
return nullptr;
@@ -10246,7 +10584,7 @@ MachineInstr *SIInstrInfo::createPHISourceCopy(
InsPt++;
return BuildMI(MBB, InsPt, DL,
get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
- .addReg(Src, 0, SrcSubReg)
+ .addReg(Src, {}, SrcSubReg)
.addReg(AMDGPU::EXEC, RegState::Implicit);
}
return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
@@ -10310,6 +10648,14 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
return SchedModel.computeInstrLatency(&MI);
}
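+// AMDGPU call instructions carry the callee address in src0; if there is no
+// src0 operand, fall back to the generic implementation.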
+const MachineOperand &
+SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const {
+ if (const MachineOperand *CallAddrOp =
+ getNamedOperand(MI, AMDGPU::OpName::src0))
+ return *CallAddrOp;
+ return TargetInstrInfo::getCalleeOperand(MI);
+}
+
InstructionUniformity
SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
@@ -10385,6 +10731,12 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::Default;
}
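+// Construct the AMDGPU MIR formatter lazily on first use.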
+const MIRFormatter *SIInstrInfo::getMIRFormatter() const {
+ if (!Formatter)
+ Formatter = std::make_unique<AMDGPUMIRFormatter>(ST);
+ return Formatter.get();
+}
+
InstructionUniformity
SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
@@ -10438,7 +10790,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::Default;
}
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
// FIXME: It's conceptually broken to report this for an instruction, and not
@@ -10555,6 +10907,135 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
return false;
}
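+// Returns true if SCC is not live into any successor of \p MBB.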
+static bool isSCCDeadOnExit(MachineBasicBlock *MBB) {
+ for (MachineBasicBlock *S : MBB->successors()) {
+ if (S->isLiveIn(AMDGPU::SCC))
+ return false;
+ }
+ return true;
+}
+
+// Invert all uses of SCC following SCCDef, because SCCDef may be deleted,
+// after which (incoming SCC) = !(SCC defined by SCCDef).
+// Return true if all uses can be rewritten, false otherwise.
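+//
+// An S_CSELECT use is inverted by swapping its two data operands; an
+// S_CBRANCH use is inverted by flipping S_CBRANCH_SCC0 <-> S_CBRANCH_SCC1.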
+bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
+ MachineBasicBlock *MBB = SCCDef->getParent();
+ SmallVector<MachineInstr *> InvertInstr;
+ bool SCCIsDead = false;
+
+ // Scan instructions for SCC uses that need to be inverted until SCC is dead.
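+ // Bound the scan to keep compile time in check; the cutoff is a heuristic.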
+ constexpr unsigned ScanLimit = 12;
+ unsigned Count = 0;
+ for (MachineInstr &MI :
+ make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
+ if (++Count > ScanLimit)
+ return false;
+ if (MI.readsRegister(AMDGPU::SCC, &RI)) {
+ if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
+ MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
+ MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
+ MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
+ InvertInstr.push_back(&MI);
+ else
+ return false;
+ }
+ if (MI.definesRegister(AMDGPU::SCC, &RI)) {
+ SCCIsDead = true;
+ break;
+ }
+ }
+ if (!SCCIsDead && isSCCDeadOnExit(MBB))
+ SCCIsDead = true;
+
+ // SCC may have more uses that we did not scan; we cannot invert them all.
+ if (!SCCIsDead)
+ return false;
+
+ // Invert uses
+ for (MachineInstr *MI : InvertInstr) {
+ if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
+ MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
+ swapOperands(*MI);
+ } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
+ MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
+ MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
+ ? AMDGPU::S_CBRANCH_SCC1
+ : AMDGPU::S_CBRANCH_SCC0));
+ } else {
+ llvm_unreachable("SCC used but no inversion handling");
+ }
+ }
+ return true;
+}
+
+// SCC is already valid after SCCValid.
+// SCCRedefine will redefine SCC to the same value that is already available
+// after SCCValid. If there are no intervening SCC conflicts, delete
+// SCCRedefine and update kill/dead flags as necessary.
+bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
+ bool NeedInversion) const {
+ MachineInstr *KillsSCC = nullptr;
+ if (SCCValid->getParent() != SCCRedefine->getParent())
+ return false;
+ for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
+ SCCRedefine->getIterator())) {
+ if (MI.modifiesRegister(AMDGPU::SCC, &RI))
+ return false;
+ if (MI.killsRegister(AMDGPU::SCC, &RI))
+ KillsSCC = &MI;
+ }
+ if (NeedInversion && !invertSCCUse(SCCRedefine))
+ return false;
+ if (MachineOperand *SccDef =
+ SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
+ SccDef->setIsDead(false);
+ if (KillsSCC)
+ KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
+ SCCRedefine->eraseFromParent();
+ return true;
+}
+
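+// Returns true for an S_CSELECT_B32/B64 of the form (non-zero imm), 0: its
+// result is non-zero exactly when the SCC it reads is set.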
+static bool foldableSelect(const MachineInstr &Def) {
+ if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
+ Def.getOpcode() != AMDGPU::S_CSELECT_B64)
+ return false;
+ bool Op1IsNonZeroImm =
+ Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
+ bool Op2IsZeroImm =
+ Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
+ if (!Op1IsNonZeroImm || !Op2IsZeroImm)
+ return false;
+ return true;
+}
+
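+// Returns true if \p Def sets SCC exactly when its result is zero, possibly
+// after being rewritten to \p NewDefOpc; flips \p NeedInversion on success.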
+static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
+ unsigned &NewDefOpc) {
+ // S_ADD_U32 X, 1 sets SCC on carry-out, which can only happen if the
+ // result is 0.
+ // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if its SCC def is dead.
+ if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
+ Def.getOpcode() != AMDGPU::S_ADD_U32)
+ return false;
+ const MachineOperand &AddSrc1 = Def.getOperand(1);
+ const MachineOperand &AddSrc2 = Def.getOperand(2);
+ int64_t Addend;
+
+ if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
+ (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
+ (!getFoldableImm(&AddSrc1, Addend) || Addend != 1) &&
+ (!getFoldableImm(&AddSrc2, Addend) || Addend != 1))
+ return false;
+
+ if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
+ const MachineOperand *SccDef =
+ Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
+ if (!SccDef->isDead())
+ return false;
+ NewDefOpc = AMDGPU::S_ADD_U32;
+ }
+ NeedInversion = !NeedInversion;
+ return true;
+}
+
bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
Register SrcReg2, int64_t CmpMask,
int64_t CmpValue,
@@ -10565,6 +11046,72 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
return false;
+ const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
+ this](bool NeedInversion) -> bool {
+ if (CmpValue != 0)
+ return false;
+
+ MachineInstr *Def = MRI->getVRegDef(SrcReg);
+ if (!Def)
+ return false;
+
+ // For an S_OP that sets SCC = (DST != 0), perform the transformation
+ //
+ // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
+ //
+ // For an S_OP that sets SCC = (DST == 0), flip NeedInversion and perform
+ // the same transformation:
+ //
+ // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
+ //
+ // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
+ // for the S_CSELECT* already equals the value that s_cmp_lg_* would
+ // compute:
+ //
+ // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
+ // (non-zero imm), 0)
+
+ unsigned NewDefOpc = Def->getOpcode();
+ if (!setsSCCIfResultIsNonZero(*Def) &&
+ !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) &&
+ !foldableSelect(*Def))
+ return false;
+
+ if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
+ return false;
+
+ if (NewDefOpc != Def->getOpcode())
+ Def->setDesc(get(NewDefOpc));
+
+ // If the s_or_b32 result, sY, is unused (i.e. the sequence is effectively
+ // a 64-bit s_cmp_lg of a register pair) and the inputs are the lo and hi
+ // halves of a 64-bit foldableSelect, then delete the s_or_b32 in the
+ // sequence:
+ // sX = s_cselect_b64 (non-zero imm), 0
+ // sLo = copy sX.sub0
+ // sHi = copy sX.sub1
+ // sY = s_or_b32 sLo, sHi
+ if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
+ MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
+ const MachineOperand &OrOpnd1 = Def->getOperand(1);
+ const MachineOperand &OrOpnd2 = Def->getOperand(2);
+ if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
+ MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
+ MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
+ if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
+ Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
+ Def2->getOperand(1).isReg() &&
+ Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
+ Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
+ Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
+ MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
+ if (Select && foldableSelect(*Select))
+ optimizeSCC(Select, Def, /*NeedInversion=*/false);
+ }
+ }
+ }
+ return true;
+ };
+
const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
this](int64_t ExpectedValue, unsigned SrcSize,
bool IsReversible, bool IsSigned) -> bool {
@@ -10591,8 +11138,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
// s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
- MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
- if (!Def || Def->getParent() != CmpInstr.getParent())
+ MachineInstr *Def = MRI->getVRegDef(SrcReg);
+ if (!Def)
return false;
if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
@@ -10639,17 +11186,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
return false;
- for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
- I != E; ++I) {
- if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
- I->killsRegister(AMDGPU::SCC, &RI))
- return false;
- }
-
- MachineOperand *SccDef =
- Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
- SccDef->setIsDead(false);
- CmpInstr.eraseFromParent();
+ if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
+ return false;
if (!MRI->use_nodbg_empty(DefReg)) {
assert(!IsReversedCC);
@@ -10679,7 +11217,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMP_EQ_I32:
case AMDGPU::S_CMPK_EQ_U32:
case AMDGPU::S_CMPK_EQ_I32:
- return optimizeCmpAnd(1, 32, true, false);
+ return optimizeCmpAnd(1, 32, true, false) ||
+ optimizeCmpSelect(/*NeedInversion=*/true);
case AMDGPU::S_CMP_GE_U32:
case AMDGPU::S_CMPK_GE_U32:
return optimizeCmpAnd(1, 32, false, false);
@@ -10692,7 +11231,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMP_LG_I32:
case AMDGPU::S_CMPK_LG_U32:
case AMDGPU::S_CMPK_LG_I32:
- return optimizeCmpAnd(0, 32, true, false);
+ return optimizeCmpAnd(0, 32, true, false) ||
+ optimizeCmpSelect(/*NeedInversion=*/false);
case AMDGPU::S_CMP_GT_U32:
case AMDGPU::S_CMPK_GT_U32:
return optimizeCmpAnd(0, 32, false, false);
@@ -10700,7 +11240,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMPK_GT_I32:
return optimizeCmpAnd(0, 32, false, true);
case AMDGPU::S_CMP_LG_U64:
- return optimizeCmpAnd(0, 64, true, false);
+ return optimizeCmpAnd(0, 64, true, false) ||
+ optimizeCmpSelect(/*NeedInversion=*/false);
}
return false;
@@ -10731,7 +11272,7 @@ void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
: &AMDGPU::VReg_64_Align2RegClass);
BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
- .addReg(DataReg, 0, Op.getSubReg())
+ .addReg(DataReg, {}, Op.getSubReg())
.addImm(AMDGPU::sub0)
.addReg(Undef)
.addImm(AMDGPU::sub1);
@@ -10751,7 +11292,7 @@ bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
if (!isWMMA(MI) && !isSWMMAC(MI))
return false;
- if (AMDGPU::isGFX1250(ST))
+ if (ST.hasGFX1250Insts())
return AMDGPU::getWMMAIsXDL(MI.getOpcode());
return true;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e979eeb..0b54513 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -52,6 +52,11 @@ static const MachineMemOperand::Flags MOLastUse =
static const MachineMemOperand::Flags MOCooperative =
MachineMemOperand::MOTargetFlag3;
+/// Mark the MMO of accesses to memory locations that are
+/// never written to by other threads.
+static const MachineMemOperand::Flags MOThreadPrivate =
+ MachineMemOperand::MOTargetFlag4;
+
/// Utility to store machine instructions worklist.
struct SIInstrWorklist {
SIInstrWorklist() = default;
@@ -88,6 +93,8 @@ private:
};
class SIInstrInfo final : public AMDGPUGenInstrInfo {
+ struct ThreeAddressUpdates;
+
private:
const SIRegisterInfo RI;
const GCNSubtarget &ST;
@@ -123,6 +130,11 @@ public:
unsigned SubIdx, const TargetRegisterClass *SubRC) const;
private:
+ bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
+ bool NeedInversion) const;
+
+ bool invertSCCUse(MachineInstr *SCCDef) const;
+
void swapOperands(MachineInstr &Inst) const;
std::pair<bool, MachineBasicBlock *>
@@ -134,6 +146,8 @@ private:
void lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
+ void lowerScalarAbsDiff(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
+
void lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
void splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst,
@@ -170,7 +184,7 @@ private:
void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI,
SIInstrWorklist &Worklist) const;
- void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
+ void addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
MachineInstr &SCCDefInst,
SIInstrWorklist &Worklist,
Register NewCond = Register()) const;
@@ -190,6 +204,9 @@ private:
bool resultDependsOnExec(const MachineInstr &MI) const;
+ MachineInstr *convertToThreeAddressImpl(MachineInstr &MI,
+ ThreeAddressUpdates &Updates) const;
+
protected:
/// If the specific machine instruction is a instruction that moves/copies
/// value from one register to another register return destination and source
@@ -291,6 +308,8 @@ public:
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg,
int64_t &ImmVal) const override;
+ std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
+
unsigned getVectorRegSpillSaveOpcode(Register Reg,
const TargetRegisterClass *RC,
unsigned Size,
@@ -302,22 +321,20 @@ public:
void storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
- bool isKill, int FrameIndex, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI, Register VReg,
+ bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
void loadRegFromStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
- int FrameIndex, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI, Register VReg,
+ int FrameIndex, const TargetRegisterClass *RC, Register VReg,
+ unsigned SubReg = 0,
MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
bool expandPostRAPseudo(MachineInstr &MI) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
Register DestReg, unsigned SubIdx,
- const MachineInstr &Orig,
- const TargetRegisterInfo &TRI) const override;
+ const MachineInstr &Orig) const override;
// Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp
// instructions. Returns a pair of generated instructions.
@@ -421,6 +438,9 @@ public:
void removeModOperands(MachineInstr &MI) const;
+ void mutateAndCleanupImplicit(MachineInstr &MI,
+ const MCInstrDesc &NewDesc) const;
+
/// Return the extracted immediate value in a subregister use from a constant
/// materialized in a super register.
///
@@ -446,7 +466,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::SALU;
}
- bool isSALU(uint16_t Opcode) const {
+ bool isSALU(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SALU;
}
@@ -454,7 +474,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::VALU;
}
- bool isVALU(uint16_t Opcode) const {
+ bool isVALU(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VALU;
}
@@ -462,7 +482,7 @@ public:
return isMIMG(MI) || isVSAMPLE(MI) || isVIMAGE(MI);
}
- bool isImage(uint16_t Opcode) const {
+ bool isImage(uint32_t Opcode) const {
return isMIMG(Opcode) || isVSAMPLE(Opcode) || isVIMAGE(Opcode);
}
@@ -470,7 +490,7 @@ public:
return isMUBUF(MI) || isMTBUF(MI) || isImage(MI) || isFLAT(MI);
}
- bool isVMEM(uint16_t Opcode) const {
+ bool isVMEM(uint32_t Opcode) const {
return isMUBUF(Opcode) || isMTBUF(Opcode) || isImage(Opcode);
}
@@ -478,7 +498,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::SOP1;
}
- bool isSOP1(uint16_t Opcode) const {
+ bool isSOP1(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SOP1;
}
@@ -486,7 +506,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::SOP2;
}
- bool isSOP2(uint16_t Opcode) const {
+ bool isSOP2(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SOP2;
}
@@ -494,7 +514,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::SOPC;
}
- bool isSOPC(uint16_t Opcode) const {
+ bool isSOPC(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SOPC;
}
@@ -502,7 +522,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::SOPK;
}
- bool isSOPK(uint16_t Opcode) const {
+ bool isSOPK(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SOPK;
}
@@ -510,7 +530,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::SOPP;
}
- bool isSOPP(uint16_t Opcode) const {
+ bool isSOPP(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SOPP;
}
@@ -518,7 +538,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::IsPacked;
}
- bool isPacked(uint16_t Opcode) const {
+ bool isPacked(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::IsPacked;
}
@@ -526,7 +546,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::VOP1;
}
- bool isVOP1(uint16_t Opcode) const {
+ bool isVOP1(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VOP1;
}
@@ -534,7 +554,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::VOP2;
}
- bool isVOP2(uint16_t Opcode) const {
+ bool isVOP2(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VOP2;
}
@@ -544,13 +564,13 @@ public:
static bool isVOP3(const MachineInstr &MI) { return isVOP3(MI.getDesc()); }
- bool isVOP3(uint16_t Opcode) const { return isVOP3(get(Opcode)); }
+ bool isVOP3(uint32_t Opcode) const { return isVOP3(get(Opcode)); }
static bool isSDWA(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::SDWA;
}
- bool isSDWA(uint16_t Opcode) const {
+ bool isSDWA(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SDWA;
}
@@ -558,7 +578,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::VOPC;
}
- bool isVOPC(uint16_t Opcode) const {
+ bool isVOPC(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VOPC;
}
@@ -566,7 +586,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::MUBUF;
}
- bool isMUBUF(uint16_t Opcode) const {
+ bool isMUBUF(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::MUBUF;
}
@@ -574,15 +594,19 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::MTBUF;
}
- bool isMTBUF(uint16_t Opcode) const {
+ bool isMTBUF(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
}
+ static bool isBUF(const MachineInstr &MI) {
+ return isMUBUF(MI) || isMTBUF(MI);
+ }
+
static bool isSMRD(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::SMRD;
}
- bool isSMRD(uint16_t Opcode) const {
+ bool isSMRD(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SMRD;
}
@@ -592,33 +616,35 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::DS;
}
- bool isDS(uint16_t Opcode) const {
+ bool isDS(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::DS;
}
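+ // Instructions counted by TENSOR_CNT also perform LDS DMA.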
static bool isLDSDMA(const MachineInstr &MI) {
- return isVALU(MI) && (isMUBUF(MI) || isFLAT(MI));
+ return (isVALU(MI) && (isMUBUF(MI) || isFLAT(MI))) ||
+ (MI.getDesc().TSFlags & SIInstrFlags::TENSOR_CNT);
}
- bool isLDSDMA(uint16_t Opcode) {
- return isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode));
+ bool isLDSDMA(uint32_t Opcode) {
+ return (isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode))) ||
+ (get(Opcode).TSFlags & SIInstrFlags::TENSOR_CNT);
}
static bool isGWS(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::GWS;
}
- bool isGWS(uint16_t Opcode) const {
+ bool isGWS(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::GWS;
}
- bool isAlwaysGDS(uint16_t Opcode) const;
+ bool isAlwaysGDS(uint32_t Opcode) const;
static bool isMIMG(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::MIMG;
}
- bool isMIMG(uint16_t Opcode) const {
+ bool isMIMG(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::MIMG;
}
@@ -626,7 +652,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::VIMAGE;
}
- bool isVIMAGE(uint16_t Opcode) const {
+ bool isVIMAGE(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VIMAGE;
}
@@ -634,7 +660,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::VSAMPLE;
}
- bool isVSAMPLE(uint16_t Opcode) const {
+ bool isVSAMPLE(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VSAMPLE;
}
@@ -642,7 +668,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::Gather4;
}
- bool isGather4(uint16_t Opcode) const {
+ bool isGather4(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::Gather4;
}
@@ -657,7 +683,7 @@ public:
return Flags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch);
}
- bool isSegmentSpecificFLAT(uint16_t Opcode) const {
+ bool isSegmentSpecificFLAT(uint32_t Opcode) const {
auto Flags = get(Opcode).TSFlags;
return Flags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch);
}
@@ -666,7 +692,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::FlatGlobal;
}
- bool isFLATGlobal(uint16_t Opcode) const {
+ bool isFLATGlobal(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::FlatGlobal;
}
@@ -674,20 +700,20 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::FlatScratch;
}
- bool isFLATScratch(uint16_t Opcode) const {
+ bool isFLATScratch(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::FlatScratch;
}
// Any FLAT encoded instruction, including global_* and scratch_*.
- bool isFLAT(uint16_t Opcode) const {
+ bool isFLAT(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
}
- /// \returns true for SCRATCH_ instructions, or FLAT_ instructions with
- /// SCRATCH_ memory operands.
+ /// \returns true for SCRATCH_ instructions, and for FLAT/BUF instructions
+ /// unless their MMOs show that scratch is not accessed.
/// Conservatively correct; will return true if \p MI cannot be proven
/// to not hit scratch.
- bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+ bool mayAccessScratch(const MachineInstr &MI) const;
/// \returns true for FLAT instructions that can access VMEM.
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
@@ -695,7 +721,7 @@ public:
/// \returns true for FLAT instructions that can access LDS.
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
- static bool isBlockLoadStore(uint16_t Opcode) {
+ static bool isBlockLoadStore(uint32_t Opcode) {
switch (Opcode) {
case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
@@ -709,6 +735,52 @@ public:
}
}
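+ /// \returns true if \p MI is a SALU instruction that defines SCC as
+ /// (result != 0).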
+ static bool setsSCCIfResultIsNonZero(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_ABSDIFF_I32:
+ case AMDGPU::S_ABS_I32:
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_ANDN2_B32:
+ case AMDGPU::S_ANDN2_B64:
+ case AMDGPU::S_ASHR_I32:
+ case AMDGPU::S_ASHR_I64:
+ case AMDGPU::S_BCNT0_I32_B32:
+ case AMDGPU::S_BCNT0_I32_B64:
+ case AMDGPU::S_BCNT1_I32_B32:
+ case AMDGPU::S_BCNT1_I32_B64:
+ case AMDGPU::S_BFE_I32:
+ case AMDGPU::S_BFE_I64:
+ case AMDGPU::S_BFE_U32:
+ case AMDGPU::S_BFE_U64:
+ case AMDGPU::S_LSHL_B32:
+ case AMDGPU::S_LSHL_B64:
+ case AMDGPU::S_LSHR_B32:
+ case AMDGPU::S_LSHR_B64:
+ case AMDGPU::S_NAND_B32:
+ case AMDGPU::S_NAND_B64:
+ case AMDGPU::S_NOR_B32:
+ case AMDGPU::S_NOR_B64:
+ case AMDGPU::S_NOT_B32:
+ case AMDGPU::S_NOT_B64:
+ case AMDGPU::S_OR_B32:
+ case AMDGPU::S_OR_B64:
+ case AMDGPU::S_ORN2_B32:
+ case AMDGPU::S_ORN2_B64:
+ case AMDGPU::S_QUADMASK_B32:
+ case AMDGPU::S_QUADMASK_B64:
+ case AMDGPU::S_WQM_B32:
+ case AMDGPU::S_WQM_B64:
+ case AMDGPU::S_XNOR_B32:
+ case AMDGPU::S_XNOR_B64:
+ case AMDGPU::S_XOR_B32:
+ case AMDGPU::S_XOR_B64:
+ return true;
+ default:
+ return false;
+ }
+ }
+
static bool isEXP(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::EXP;
}
@@ -721,7 +793,7 @@ public:
Target == AMDGPU::Exp::ET_DUAL_SRC_BLEND1;
}
- bool isEXP(uint16_t Opcode) const {
+ bool isEXP(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::EXP;
}
@@ -729,7 +801,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::IsAtomicNoRet;
}
- bool isAtomicNoRet(uint16_t Opcode) const {
+ bool isAtomicNoRet(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::IsAtomicNoRet;
}
@@ -737,7 +809,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::IsAtomicRet;
}
- bool isAtomicRet(uint16_t Opcode) const {
+ bool isAtomicRet(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::IsAtomicRet;
}
@@ -746,13 +818,17 @@ public:
SIInstrFlags::IsAtomicNoRet);
}
- bool isAtomic(uint16_t Opcode) const {
+ bool isAtomic(uint32_t Opcode) const {
return get(Opcode).TSFlags & (SIInstrFlags::IsAtomicRet |
SIInstrFlags::IsAtomicNoRet);
}
static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
- return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
+ unsigned Opc = MI.getOpcode();
+ // Exclude instructions that read FROM LDS (not write to it)
+ return isLDSDMA(MI) && Opc != AMDGPU::BUFFER_STORE_LDS_DWORD &&
+ Opc != AMDGPU::TENSOR_STORE_FROM_LDS &&
+ Opc != AMDGPU::TENSOR_STORE_FROM_LDS_D2;
}
static bool isSBarrierSCCWrite(unsigned Opcode) {
@@ -771,7 +847,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::WQM;
}
- bool isWQM(uint16_t Opcode) const {
+ bool isWQM(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::WQM;
}
@@ -779,7 +855,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM;
}
- bool isDisableWQM(uint16_t Opcode) const {
+ bool isDisableWQM(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::DisableWQM;
}
@@ -794,7 +870,7 @@ public:
(isSpill(MI) && isVALU(MI));
}
- bool isVGPRSpill(uint16_t Opcode) const {
+ bool isVGPRSpill(uint32_t Opcode) const {
return Opcode != AMDGPU::SI_SPILL_S32_TO_VGPR &&
Opcode != AMDGPU::SI_RESTORE_S32_FROM_VGPR &&
(isSpill(Opcode) && isVALU(Opcode));
@@ -806,13 +882,13 @@ public:
(isSpill(MI) && isSALU(MI));
}
- bool isSGPRSpill(uint16_t Opcode) const {
+ bool isSGPRSpill(uint32_t Opcode) const {
return Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR ||
Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
(isSpill(Opcode) && isSALU(Opcode));
}
- bool isSpill(uint16_t Opcode) const {
+ bool isSpill(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::Spill;
}
@@ -822,7 +898,7 @@ public:
static bool isSpill(const MachineInstr &MI) { return isSpill(MI.getDesc()); }
- static bool isWWMRegSpillOpcode(uint16_t Opcode) {
+ static bool isWWMRegSpillOpcode(uint32_t Opcode) {
return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE ||
Opcode == AMDGPU::SI_SPILL_WWM_AV32_SAVE ||
Opcode == AMDGPU::SI_SPILL_WWM_V32_RESTORE ||
@@ -838,7 +914,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::DPP;
}
- bool isDPP(uint16_t Opcode) const {
+ bool isDPP(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::DPP;
}
@@ -846,7 +922,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::TRANS;
}
- bool isTRANS(uint16_t Opcode) const {
+ bool isTRANS(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::TRANS;
}
@@ -854,7 +930,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::VOP3P;
}
- bool isVOP3P(uint16_t Opcode) const {
+ bool isVOP3P(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VOP3P;
}
@@ -862,7 +938,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::VINTRP;
}
- bool isVINTRP(uint16_t Opcode) const {
+ bool isVINTRP(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VINTRP;
}
@@ -872,13 +948,18 @@ public:
static bool isMAI(const MachineInstr &MI) { return isMAI(MI.getDesc()); }
- bool isMAI(uint16_t Opcode) const { return isMAI(get(Opcode)); }
+ bool isMAI(uint32_t Opcode) const { return isMAI(get(Opcode)); }
static bool isMFMA(const MachineInstr &MI) {
return isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
}
+ bool isMFMA(uint32_t Opcode) const {
+ return isMAI(Opcode) && Opcode != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ Opcode != AMDGPU::V_ACCVGPR_READ_B32_e64;
+ }
+
static bool isDOT(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::IsDOT;
}
@@ -887,7 +968,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::IsWMMA;
}
- bool isWMMA(uint16_t Opcode) const {
+ bool isWMMA(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::IsWMMA;
}
@@ -895,15 +976,19 @@ public:
return isMFMA(MI) || isWMMA(MI) || isSWMMAC(MI);
}
+ bool isMFMAorWMMA(uint32_t Opcode) const {
+ return isMFMA(Opcode) || isWMMA(Opcode) || isSWMMAC(Opcode);
+ }
+
static bool isSWMMAC(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::IsSWMMAC;
}
- bool isSWMMAC(uint16_t Opcode) const {
+ bool isSWMMAC(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::IsSWMMAC;
}
- bool isDOT(uint16_t Opcode) const {
+ bool isDOT(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::IsDOT;
}
@@ -917,7 +1002,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::LDSDIR;
}
- bool isLDSDIR(uint16_t Opcode) const {
+ bool isLDSDIR(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::LDSDIR;
}
@@ -925,7 +1010,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::VINTERP;
}
- bool isVINTERP(uint16_t Opcode) const {
+ bool isVINTERP(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VINTERP;
}
@@ -941,6 +1026,14 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::LGKM_CNT;
}
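+ /// \returns true if the instruction is tracked by the ASYNC_CNT counter.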
+ static bool usesASYNC_CNT(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::ASYNC_CNT;
+ }
+
+ bool usesASYNC_CNT(uint32_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::ASYNC_CNT;
+ }
+
// Most SOPK instructions treat the immediate as signed 16-bit; however,
// some use it as unsigned.
static bool sopkIsZext(unsigned Opcode) {
@@ -957,7 +1050,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::SCALAR_STORE;
}
- bool isScalarStore(uint16_t Opcode) const {
+ bool isScalarStore(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SCALAR_STORE;
}
@@ -965,7 +1058,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::FIXED_SIZE;
}
- bool isFixedSize(uint16_t Opcode) const {
+ bool isFixedSize(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE;
}
@@ -973,7 +1066,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::FPClamp;
}
- bool hasFPClamp(uint16_t Opcode) const {
+ bool hasFPClamp(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::FPClamp;
}
@@ -993,7 +1086,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::FPDPRounding;
}
- bool usesFPDPRounding(uint16_t Opcode) const {
+ bool usesFPDPRounding(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding;
}
@@ -1001,7 +1094,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::FPAtomic;
}
- bool isFPAtomic(uint16_t Opcode) const {
+ bool isFPAtomic(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::FPAtomic;
}
@@ -1046,7 +1139,7 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::TiedSourceNotRead;
}
- bool doesNotReadTiedSource(uint16_t Opcode) const {
+ bool doesNotReadTiedSource(uint32_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::TiedSourceNotRead;
}
@@ -1114,13 +1207,13 @@ public:
bool isVGPRCopy(const MachineInstr &MI) const {
assert(isCopyInstr(MI));
Register Dest = MI.getOperand(0).getReg();
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
return !RI.isSGPRReg(MRI, Dest);
}
bool hasVGPRUses(const MachineInstr &MI) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
return llvm::any_of(MI.explicit_uses(),
[&MRI, this](const MachineOperand &MO) {
@@ -1289,7 +1382,7 @@ public:
/// Return the size in bytes of the operand OpNo on the given
// instruction opcode.
- unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const {
+ unsigned getOpSize(uint32_t Opcode, unsigned OpNo) const {
const MCOperandInfo &OpInfo = get(Opcode).operands()[OpNo];
if (OpInfo.RegClass == -1) {
@@ -1501,6 +1594,8 @@ public:
bool isBasicBlockPrologue(const MachineInstr &MI,
Register Reg = Register()) const override;
+ bool canAddToBBProlog(const MachineInstr &MI) const;
+
MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsPt,
const DebugLoc &DL, Register Src,
@@ -1562,10 +1657,6 @@ public:
/// Return true if this opcode should not be used by codegen.
bool isAsmOnlyOpcode(int MCOp) const;
- const TargetRegisterClass *
- getRegClass(const MCInstrDesc &TID, unsigned OpNum,
- const TargetRegisterInfo *TRI) const override;
-
void fixImplicitOperands(MachineInstr &MI) const;
MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
@@ -1579,22 +1670,21 @@ public:
const MachineInstr &MI,
unsigned *PredCost = nullptr) const override;
+ const MachineOperand &getCalleeOperand(const MachineInstr &MI) const override;
+
InstructionUniformity
- getInstructionUniformity(const MachineInstr &MI) const override final;
+ getInstructionUniformity(const MachineInstr &MI) const final;
InstructionUniformity
getGenericInstructionUniformity(const MachineInstr &MI) const;
- const MIRFormatter *getMIRFormatter() const override {
- if (!Formatter)
- Formatter = std::make_unique<AMDGPUMIRFormatter>();
- return Formatter.get();
- }
+ const MIRFormatter *getMIRFormatter() const override;
static unsigned getDSShaderTypeValue(const MachineFunction &MF);
const TargetSchedModel &getSchedModel() const { return SchedModel; }
+ // FIXME: This should be removed
// Enforce operand's \p OpName even alignment if required by target.
// This is used if an operand is a 32 bit register but needs to be aligned
// regardless.
@@ -1627,7 +1717,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
/// skipping copy like instructions and subreg-manipulation pseudos.
/// Following another subreg of a reg:subreg isn't supported.
MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
- MachineRegisterInfo &MRI);
+ const MachineRegisterInfo &MRI);
/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
/// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not
@@ -1647,86 +1737,86 @@ bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
namespace AMDGPU {
LLVM_READONLY
- int getVOPe64(uint16_t Opcode);
+ int64_t getVOPe64(uint32_t Opcode);
LLVM_READONLY
- int getVOPe32(uint16_t Opcode);
+ int64_t getVOPe32(uint32_t Opcode);
LLVM_READONLY
- int getSDWAOp(uint16_t Opcode);
+ int64_t getSDWAOp(uint32_t Opcode);
LLVM_READONLY
- int getDPPOp32(uint16_t Opcode);
+ int64_t getDPPOp32(uint32_t Opcode);
LLVM_READONLY
- int getDPPOp64(uint16_t Opcode);
+ int64_t getDPPOp64(uint32_t Opcode);
LLVM_READONLY
- int getBasicFromSDWAOp(uint16_t Opcode);
+ int64_t getBasicFromSDWAOp(uint32_t Opcode);
LLVM_READONLY
- int getCommuteRev(uint16_t Opcode);
+ int64_t getCommuteRev(uint32_t Opcode);
LLVM_READONLY
- int getCommuteOrig(uint16_t Opcode);
+ int64_t getCommuteOrig(uint32_t Opcode);
LLVM_READONLY
- int getAddr64Inst(uint16_t Opcode);
+ int64_t getAddr64Inst(uint32_t Opcode);
/// Check if \p Opcode is an Addr64 opcode.
///
/// \returns \p Opcode if it is an Addr64 opcode, otherwise -1.
LLVM_READONLY
- int getIfAddr64Inst(uint16_t Opcode);
+ int64_t getIfAddr64Inst(uint32_t Opcode);
LLVM_READONLY
- int getSOPKOp(uint16_t Opcode);
+ int64_t getSOPKOp(uint32_t Opcode);
/// \returns SADDR form of a FLAT Global instruction given an \p Opcode
/// of a VADDR form.
LLVM_READONLY
- int getGlobalSaddrOp(uint16_t Opcode);
+ int64_t getGlobalSaddrOp(uint32_t Opcode);
/// \returns VADDR form of a FLAT Global instruction given an \p Opcode
/// of a SADDR form.
LLVM_READONLY
- int getGlobalVaddrOp(uint16_t Opcode);
+ int64_t getGlobalVaddrOp(uint32_t Opcode);
LLVM_READONLY
- int getVCMPXNoSDstOp(uint16_t Opcode);
+ int64_t getVCMPXNoSDstOp(uint32_t Opcode);
/// \returns ST form with only immediate offset of a FLAT Scratch instruction
/// given an \p Opcode of an SS (SADDR) form.
LLVM_READONLY
- int getFlatScratchInstSTfromSS(uint16_t Opcode);
+ int64_t getFlatScratchInstSTfromSS(uint32_t Opcode);
/// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode
/// of an SVS (SADDR + VADDR) form.
LLVM_READONLY
- int getFlatScratchInstSVfromSVS(uint16_t Opcode);
+ int64_t getFlatScratchInstSVfromSVS(uint32_t Opcode);
/// \returns SS (SADDR) form of a FLAT Scratch instruction given an \p Opcode
/// of an SV (VADDR) form.
LLVM_READONLY
- int getFlatScratchInstSSfromSV(uint16_t Opcode);
+ int64_t getFlatScratchInstSSfromSV(uint32_t Opcode);
/// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode
/// of an SS (SADDR) form.
LLVM_READONLY
- int getFlatScratchInstSVfromSS(uint16_t Opcode);
+ int64_t getFlatScratchInstSVfromSS(uint32_t Opcode);
/// \returns the earlyclobber version of a MAC MFMA if one exists.
LLVM_READONLY
- int getMFMAEarlyClobberOp(uint16_t Opcode);
+ int64_t getMFMAEarlyClobberOp(uint32_t Opcode);
/// \returns Version of an MFMA instruction which uses AGPRs for srcC and
/// vdst, given an \p Opcode of an MFMA which uses VGPRs for srcC/vdst.
LLVM_READONLY
- int getMFMASrcCVDstAGPROp(uint16_t Opcode);
+ int64_t getMFMASrcCVDstAGPROp(uint32_t Opcode);
/// \returns v_cmpx version of a v_cmp instruction.
LLVM_READONLY
- int getVCMPXOpFromVCMP(uint16_t Opcode);
+ int64_t getVCMPXOpFromVCMP(uint32_t Opcode);
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index b7f63ec..f063b4e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -6,13 +6,6 @@
//
//===----------------------------------------------------------------------===//
-def isWave32 : Predicate<"Subtarget->isWave32()">,
- AssemblerPredicate <(any_of FeatureWavefrontSize32,
- FeatureAssemblerPermissiveWavesize)>;
-def isWave64 : Predicate<"Subtarget->isWave64()">,
- AssemblerPredicate <(any_of FeatureWavefrontSize64,
- FeatureAssemblerPermissiveWavesize)>;
-
class AMDGPUMnemonicAlias<string From, string To, string VariantName = "">
: MnemonicAlias<From, To, VariantName>, PredicateControl;
@@ -34,6 +27,7 @@ def SIEncodingFamily {
int GFX11 = 10;
int GFX12 = 11;
int GFX1250 = 12;
+ int GFX13 = 13;
}
//===----------------------------------------------------------------------===//
@@ -47,6 +41,7 @@ class GFXGen<Predicate pred, string dn, string suffix, int sub> {
int Subtarget = sub;
}
+def GFX13Gen : GFXGen<isGFX13Only, "GFX13", "_gfx13", SIEncodingFamily.GFX13>;
def GFX1250Gen : GFXGen<isGFX125xOnly, "GFX1250", "_gfx1250", SIEncodingFamily.GFX1250>;
def GFX12Not12_50Gen : GFXGen<isGFX12Not12_50, "GFX12", "_gfx12", SIEncodingFamily.GFX12>;
def GFX12Gen : GFXGen<isGFX12Only, "GFX12", "_gfx12", SIEncodingFamily.GFX12>;
@@ -57,6 +52,8 @@ def GFX10Gen : GFXGen<isGFX10Only, "GFX10", "_gfx10", SIEncodingFamily.GFX10>;
// SI DAG Nodes
//===----------------------------------------------------------------------===//
+// Clamp the value between 0.0 and 1.0. NaN is clamped to 0, following the
+// clamp output modifier behavior with dx10_enable.
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
def SDTSBufferLoad : SDTypeProfile<1, 3,
@@ -331,7 +328,7 @@ def mfma_f32_32x32x64_f8f6f4 : UnscaledMFMAOptimizationPat<int_amdgcn_mfma_scale
//===----------------------------------------------------------------------===//
class isIntType<ValueType SrcVT> {
- bit ret = !and(SrcVT.isInteger, !ne(SrcVT.Value, i1.Value));
+ bit ret = !and(SrcVT.isInteger, !ne(SrcVT, i1));
}
def SDTSBufferPrefetch : SDTypeProfile<0, 3,
@@ -776,11 +773,7 @@ def xnor : PatFrag <
foreach I = 1-4 in {
def shl#I#_add : PatFrag <
(ops node:$src0, node:$src1),
- (add (shl_oneuse $src0, (i32 I)), $src1)> {
- // FIXME: Poor substitute for disabling pattern in SelectionDAG
- let PredicateCode = [{return false;}];
- let GISelPredicateCode = [{return true;}];
-}
+ (add (shl_oneuse $src0, (i32 I)), $src1)>;
}
multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
@@ -818,6 +811,8 @@ defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
defm atomic_load_uinc_wrap : SIAtomicM0Glue2 <"LOAD_UINC_WRAP">;
defm atomic_load_udec_wrap : SIAtomicM0Glue2 <"LOAD_UDEC_WRAP">;
+defm atomic_load_usub_cond : SIAtomicM0Glue2 <"LOAD_USUB_COND">;
+defm atomic_load_usub_sat : SIAtomicM0Glue2 <"LOAD_USUB_SAT">;
defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
@@ -963,6 +958,9 @@ def InlineImmFP64 : FPImmLeaf<f64, [{
return isInlineImmediate(Imm);
}]>;
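+// Matches a floating-point immediate that is exactly positive zero (+0.0).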
+def fpimm_pos_zero : FPImmLeaf<fAny, [{
+ return Imm.isZero() && !Imm.isNegative();
+}]>;
class VGPRImm <dag frag> : PatLeaf<frag, [{
return isVGPRImm(N);
@@ -991,6 +989,11 @@ def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
return fp16SrcZerosHighBits(N->getOpcode());
}]>;
+
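+// Matches a two-element build_vector whose elements are both +0.0.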
+class build_vector_fpimm_pos_zero_v2<VTVec vec> : PatLeaf<
+ (vec (build_vector (vec.ElementType fpimm_pos_zero),
+ (vec.ElementType fpimm_pos_zero)))>;
+
def MFMALdScaleXForm : SDNodeXForm<timm, [{
unsigned Val = N->getZExtValue();
unsigned New = 0;
@@ -1001,11 +1004,13 @@ def MFMALdScaleXForm : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32);
}]>;
-def is_canonicalized : PatLeaf<(fAny srcvalue:$src), [{
+def fcanonicalize_canonicalized
+ : PatFrag<(ops node:$op), (fcanonicalize node:$op), [{
const SITargetLowering &Lowering =
*static_cast<const SITargetLowering *>(getTargetLowering());
- return Lowering.isCanonicalized(*CurDAG, Op);
+ return Lowering.isCanonicalized(*CurDAG, Op->getOperand(0), N->getFlags());
}]> {
+ // FIXME: This predicate for GlobalISel is dead code.
let GISelPredicateCode = [{
const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
MF.getSubtarget().getTargetLowering());
@@ -1084,6 +1089,8 @@ def VReg32OrOffClass : AsmOperandClass {
def SendMsg : CustomOperand<i32>;
+def WaitEvent : CustomOperand<i16>;
+
def Swizzle : CustomOperand<i16, 1>;
def Endpgm : CustomOperand<i16, 1>;
@@ -1197,12 +1204,12 @@ class NamedIntOperand<string prefix, bit Optional = 1, string name = NAME>
!if(AlwaysPrint, "true", "false")#"); }";
}
-class NamedBitOperand<string Id, string Name = NAME>
+class NamedBitOperand<string Id, string Name = NAME, bit AlwaysIgnoreNegative = 0>
: CustomOperand<i1, 1, Name> {
let PredicateMethod = "isImmTy<AMDGPUOperand::"#ImmTy#">";
let ParserMethod =
"[this](OperandVector &Operands) -> ParseStatus { "#
- "return parseNamedBit(\""#Id#"\", Operands, AMDGPUOperand::"#ImmTy#"); }";
+ "return parseNamedBit(\""#Id#"\", Operands, AMDGPUOperand::"#ImmTy# !if(AlwaysIgnoreNegative, ", true", ", false")#"); }";
let PrintMethod = "[this](const MCInst *MI, unsigned OpNo, "#
"const MCSubtargetInfo &STI, raw_ostream &O) { "#
"printNamedBit(MI, OpNo, O, \""#Id#"\"); }";
@@ -1260,6 +1267,8 @@ def CPol_NonGLC : ValuePredicatedOperand<CPol, "!(Op.getImm() & CPol::GLC)", 1>;
def CPol_GLC_WithDefault : DefaultOperand<CPol_GLC, !shl(1, CPolBit.GLC)>;
def CPol_NonGLC_WithDefault : DefaultOperand<CPol_NonGLC, 0>;
+def IsAsync : NamedBitOperand<"isasync">;
+
def TFE : NamedBitOperand<"tfe">;
def UNorm : NamedBitOperand<"unorm">;
def DA : NamedBitOperand<"da">;
@@ -1267,8 +1276,10 @@ def R128A16 : CustomOperand<i1, 1>;
def A16 : NamedBitOperand<"a16">;
def D16 : NamedBitOperand<"d16">;
def LWE : NamedBitOperand<"lwe">;
-def exp_compr : NamedBitOperand<"compr", "ExpCompr">;
-def exp_vm : NamedBitOperand<"vm", "ExpVM">;
+def exp_compr : NamedBitOperand<"compr", "ExpCompr", 1>;
+def exp_vm : NamedBitOperand<"vm", "ExpVM", 1>;
+def exp_done : NamedBitOperand<"done", "Done", 1>;
+def exp_row_en : NamedBitOperand<"row_en", "RowEn", 1>;
def FORMAT : CustomOperand<i8>;
@@ -1796,10 +1807,10 @@ class SIMCInstr <string pseudo, int subtarget> {
class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> {
int ret =
- !if (!eq(Src0.Value, untyped.Value), 0,
- !if (!eq(Src1.Value, untyped.Value), 1, // VOP1
- !if (!eq(Src2.Value, untyped.Value), 2, // VOP2
- 3))); // VOP3
+ !if (!eq(Src0, untyped), 0,
+ !if (!eq(Src1, untyped), 1, // VOP1
+ !if (!eq(Src2, untyped), 2, // VOP2
+ 3))); // VOP3
}
// Returns the register class to use for the destination of VOP[123C]
@@ -1868,17 +1879,17 @@ class getVCSrcForVT<ValueType VT> {
!if(VT.isFP,
!if(!eq(VT.Size, 64),
VCSrc_f64,
- !cond(!eq(VT.Value, f16.Value) : VCSrc_f16,
- !eq(VT.Value, bf16.Value) : VCSrc_bf16,
- !eq(VT.Value, v2f16.Value) : VCSrc_v2f16,
- !eq(VT.Value, v2bf16.Value) : VCSrc_v2bf16,
+ !cond(!eq(VT, f16) : VCSrc_f16,
+ !eq(VT, bf16) : VCSrc_bf16,
+ !eq(VT, v2f16) : VCSrc_v2f16,
+ !eq(VT, v2bf16) : VCSrc_v2bf16,
1 : VCSrc_f32)
),
!if(!eq(VT.Size, 64),
VCSrc_b64,
- !if(!eq(VT.Value, i16.Value),
+ !if(!eq(VT, i16),
VCSrc_b16,
- !if(!eq(VT.Value, v2i16.Value),
+ !if(!eq(VT, v2i16),
VCSrc_v2b16,
VCSrc_b32
)
@@ -2003,28 +2014,28 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
// Float or packed int
class isModifierType<ValueType SrcVT> {
- bit ret = !or(!eq(SrcVT.Value, f16.Value),
- !eq(SrcVT.Value, bf16.Value),
- !eq(SrcVT.Value, f32.Value),
- !eq(SrcVT.Value, f64.Value),
- !eq(SrcVT.Value, v2f16.Value),
- !eq(SrcVT.Value, v2i16.Value),
- !eq(SrcVT.Value, v2bf16.Value),
- !eq(SrcVT.Value, v2f32.Value),
- !eq(SrcVT.Value, v2i32.Value),
- !eq(SrcVT.Value, v4f16.Value),
- !eq(SrcVT.Value, v4i16.Value),
- !eq(SrcVT.Value, v4bf16.Value),
- !eq(SrcVT.Value, v4f32.Value),
- !eq(SrcVT.Value, v4i32.Value),
- !eq(SrcVT.Value, v8f16.Value),
- !eq(SrcVT.Value, v8i16.Value),
- !eq(SrcVT.Value, v8bf16.Value),
- !eq(SrcVT.Value, v8f32.Value),
- !eq(SrcVT.Value, v8i32.Value),
- !eq(SrcVT.Value, v16f16.Value),
- !eq(SrcVT.Value, v16i16.Value),
- !eq(SrcVT.Value, v16bf16.Value));
+ bit ret = !or(!eq(SrcVT, f16),
+ !eq(SrcVT, bf16),
+ !eq(SrcVT, f32),
+ !eq(SrcVT, f64),
+ !eq(SrcVT, v2f16),
+ !eq(SrcVT, v2i16),
+ !eq(SrcVT, v2bf16),
+ !eq(SrcVT, v2f32),
+ !eq(SrcVT, v2i32),
+ !eq(SrcVT, v4f16),
+ !eq(SrcVT, v4i16),
+ !eq(SrcVT, v4bf16),
+ !eq(SrcVT, v4f32),
+ !eq(SrcVT, v4i32),
+ !eq(SrcVT, v8f16),
+ !eq(SrcVT, v8i16),
+ !eq(SrcVT, v8bf16),
+ !eq(SrcVT, v8f32),
+ !eq(SrcVT, v8i32),
+ !eq(SrcVT, v16f16),
+ !eq(SrcVT, v16i16),
+ !eq(SrcVT, v16bf16));
}
// Return type of input modifiers operand for specified input operand.
@@ -2057,9 +2068,9 @@ class getSrcModDPP <ValueType VT> {
class getSrcModDPP_t16 <ValueType VT, bit IsFake16 = 1> {
Operand ret =
!if (VT.isFP,
- !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
+ !if (!or(!eq(VT, f16), !eq(VT, bf16)),
FPT16_Lo128VRegInputMods<IsFake16>, FPVRegInputMods),
- !if (!eq(VT.Value, i16.Value),
+ !if (!eq(VT, i16),
IntT16_Lo128VRegInputMods<IsFake16>, IntVRegInputMods));
}
@@ -2068,11 +2079,11 @@ class getSrcModDPP_t16 <ValueType VT, bit IsFake16 = 1> {
class getSrcModVOP3VC <ValueType VT, bit IsFake16 = 1> {
Operand ret =
!if (VT.isFP,
- !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
+ !if (!or(!eq(VT, f16), !eq(VT, bf16)),
FPT16VCSrcInputMods<IsFake16>,
- !if (!eq(VT.Value, f64.Value), FP64VCSrcInputMods,
+ !if (!eq(VT, f64), FP64VCSrcInputMods,
FP32VCSrcInputMods)),
- !if (!eq(VT.Value, i16.Value),
+ !if (!eq(VT, i16),
IntT16VCSrcInputMods<IsFake16>,
Int32VCSrcInputMods));
}
@@ -2084,15 +2095,15 @@ class getSrcModVOP3VC <ValueType VT, bit IsFake16 = 1> {
class getSrc0ModVOP3DPP <ValueType VT, ValueType DstVT, bit IsFake16 = 1> {
defvar T16Dst =
!if (VT.isFP,
- !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
+ !if (!or(!eq(VT, f16), !eq(VT, bf16)),
FPT16VRegInputMods<IsFake16>, FPVRegT16DstInputMods),
- !if (!eq(VT.Value, i16.Value), IntT16VRegInputMods<IsFake16>,
+ !if (!eq(VT, i16), IntT16VRegInputMods<IsFake16>,
IntVRegT16DstInputMods));
defvar Normal =
!if (VT.isFP,
- !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
+ !if (!or(!eq(VT, f16), !eq(VT, bf16)),
FPT16VRegInputMods<IsFake16>, FPVRegInputMods),
- !if (!eq(VT.Value, i16.Value),
+ !if (!eq(VT, i16),
IntT16VRegInputMods<IsFake16>,
IntVRegInputMods));
Operand ret = !if(!and(!not(IsFake16), !eq(DstVT.Size, 16)), T16Dst, Normal);
@@ -2102,16 +2113,16 @@ class getSrc0ModVOP3DPP <ValueType VT, ValueType DstVT, bit IsFake16 = 1> {
// only operands (VOPD3 vsrc1 and vsrc2).
class getSrcModVOP3V <ValueType VT> {
Operand ret =
- !if (!eq(VT.Value, f64.Value), FP64VRegSrcInputMods,
+ !if (!eq(VT, f64), FP64VRegSrcInputMods,
FP32VRegSrcInputMods);
}
// Return type of input modifiers operand specified input operand for SDWA
class getSrcModSDWA <ValueType VT> {
- Operand ret = !if(!eq(VT.Value, f16.Value), FP16SDWAInputMods,
- !if(!eq(VT.Value, f32.Value), FP32SDWAInputMods,
- !if(!eq(VT.Value, i16.Value), Int16SDWAInputMods,
- !if(!eq(VT.Value, bf16.Value), FP16SDWAInputMods,
+ Operand ret = !if(!eq(VT, f16), FP16SDWAInputMods,
+ !if(!eq(VT, f32), FP32SDWAInputMods,
+ !if(!eq(VT, i16), Int16SDWAInputMods,
+ !if(!eq(VT, bf16), FP16SDWAInputMods,
Int32SDWAInputMods))));
}
@@ -2778,14 +2789,14 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit HasFP8ByteSel = !or(HasFP8SrcByteSel, HasFP8DstByteSel);
field bit HasBitOp3 = 0;
- field bit HasDst = !ne(DstVT.Value, untyped.Value);
+ field bit HasDst = !ne(DstVT, untyped);
field bit HasDst32 = HasDst;
field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case
field bit EmitDstSel = EmitDst;
field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret;
- field bit HasSrc0 = !ne(Src0VT.Value, untyped.Value);
- field bit HasSrc1 = !ne(Src1VT.Value, untyped.Value);
- field bit HasSrc2 = !ne(Src2VT.Value, untyped.Value);
+ field bit HasSrc0 = !ne(Src0VT, untyped);
+ field bit HasSrc1 = !ne(Src1VT, untyped);
+ field bit HasSrc2 = !ne(Src2VT, untyped);
field bit HasSrc0FloatMods = Src0VT.isFP;
field bit HasSrc1FloatMods = Src1VT.isFP;
@@ -3364,7 +3375,8 @@ def getMCOpcodeGen : InstrMapping {
[!cast<string>(SIEncodingFamily.GFX940)],
[!cast<string>(SIEncodingFamily.GFX11)],
[!cast<string>(SIEncodingFamily.GFX12)],
- [!cast<string>(SIEncodingFamily.GFX1250)]];
+ [!cast<string>(SIEncodingFamily.GFX1250)],
+ [!cast<string>(SIEncodingFamily.GFX13)]];
}
// Get equivalent SOPK instruction.
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 27e5ee9c..cde3523 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -131,7 +131,7 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
// 64-bit vector move instruction. This is mainly used by the
// SIFoldOperands pass to enable folding of inline immediates.
-def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64_AlignTarget:$vdst),
+def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)> {
let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
@@ -328,7 +328,7 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
multiclass
AMDGPUWaveReducePseudoGenerator<string Op, string DataType, ValueType ty, RegisterClass RetReg, SrcRegOrImm9 Reg> {
- let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
+ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, UseNamedOperandTable = 1, Uses = [EXEC] in {
def !toupper(Op) #"_PSEUDO_" #DataType
: VPseudoInstSI<(outs RetReg : $sdst),
(ins Reg : $src, VSrc_b32 : $strategy),
@@ -348,7 +348,7 @@ class WaveReduceOp<string OpName, string TypeStr, ValueType Ty,
// Input list : [Operation_name,
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
-// bit-width
+// input-type
// output register class,
// input register class]
defvar Operations = [
@@ -371,6 +371,15 @@ defvar Operations = [
WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>,
+
+ WaveReduceOp<"fmin", "F32", f32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"fmin", "F64", f64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"fmax", "F32", f32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"fmax", "F64", f64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"fadd", "F32", f32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"fadd", "F64", f64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"fsub", "F32", f32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"fsub", "F64", f64, SGPR_64, VSrc_b64>,
];
foreach Op = Operations in {
@@ -791,6 +800,17 @@ def : GCNPat<
(SI_CALL_ISEL $src0, (i64 0))
>;
+// Funnel shift right (fshr) patterns for uniform inputs.
+// These patterns implement fshr with scalar instructions by constructing the
+// 64-bit value {hi: $src0, lo: $src1} and performing a single right shift.
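+// i.e. fshr(a, b, s) = lo32({a, b} >> (s & 31)).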
+def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2),
+ (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), (S_AND_B32 $src2, (i32 31))), sub0))
+>;
+
+def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, (i32 ShiftAmt32Imm:$src2)),
+ (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), $src2), sub0))
+>;
+
// Wrapper around s_swappc_b64 with extra $callee parameter to track
// the called function after regalloc.
def SI_CALL : SPseudoInstSI <
@@ -804,9 +824,8 @@ def SI_CALL : SPseudoInstSI <
let isConvergent = 1;
}
-class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs),
- (ins rc:$src0, unknown:$callee, i32imm:$fpdiff),
- [(sd i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
+class SI_TCRETURN_Pseudo<RegisterClass rc, list<dag> pattern = []>
+ : SPseudoInstSI <(outs), (ins rc:$src0, unknown:$callee, i32imm:$fpdiff), pattern> {
let Size = 4;
let FixedSize = 1;
let isCall = 1;
@@ -820,8 +839,15 @@ class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs),
}
// Tail call handling pseudo
-def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, AMDGPUtc_return>;
-def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, AMDGPUtc_return_gfx>;
+def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64,
+ [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]>;
+def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64,
+ [(AMDGPUtc_return_gfx i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]>;
+
+// Tail call for chain calling conventions.
+// Uses unrestricted SGPR_64 instead of CCR_SGPR_64 because chain calls
+// never return and don't need to preserve any SGPRs.
+def SI_TCRETURN_CHAIN : SI_TCRETURN_Pseudo<SGPR_64>;
// Handle selecting indirect tail calls
def : GCNPat<
@@ -851,13 +877,13 @@ multiclass SI_CS_CHAIN_TC<
// This is essentially a tail call, but it also takes a mask to put in EXEC
// right before jumping to the callee.
def NAME: SPseudoInstSI <(outs),
- (ins CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>;
+ (ins SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>;
// Same as above, but it will first try to reallocate the VGPRs, and choose an
// EXEC mask and a callee depending on the success of the reallocation attempt.
def _DVGPR : SPseudoInstSI <(outs),
- (ins CCR_SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec,
- SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee)>;
+ (ins SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec,
+ SSrc_b32:$numvgprs, execrc:$fbexec, SGPR_64:$fbcallee)>;
} // End FixedSize = 0 etc
}
@@ -869,7 +895,7 @@ multiclass si_cs_chain_tc_pattern<
dag callee, ValueType execvt, RegisterOperand execrc, Instruction tc> {
def : GCNPat<
(AMDGPUtc_return_chain i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec),
- (tc CCR_SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec)
+ (tc SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec)
>;
}
@@ -896,14 +922,15 @@ multiclass si_cs_chain_tc_dvgpr_patterns<
(AMDGPUtc_return_chain_dvgpr i64:$src0, callee, (i32 timm:$fpdiff),
execvt:$exec, i32:$numvgprs,
execvt:$fbexec, i64:$fbcallee),
- (tc CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec,
- SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee)
+ (tc SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec,
+ SSrc_b32:$numvgprs, execrc:$fbexec, SGPR_64:$fbcallee)
>;
}
}
defm : si_cs_chain_tc_dvgpr_patterns<i32>; // On GFX12, dVGPR mode is wave32-only.
+let Defs = [SCC] in {
def ADJCALLSTACKUP : SPseudoInstSI<
(outs), (ins i32imm:$amt0, i32imm:$amt1),
[(callseq_start timm:$amt0, timm:$amt1)],
@@ -913,7 +940,6 @@ def ADJCALLSTACKUP : SPseudoInstSI<
let hasSideEffects = 1;
let usesCustomInserter = 1;
let SchedRW = [WriteSALU];
- let Defs = [SCC];
}
def ADJCALLSTACKDOWN : SPseudoInstSI<
@@ -924,9 +950,16 @@ def ADJCALLSTACKDOWN : SPseudoInstSI<
let hasSideEffects = 1;
let usesCustomInserter = 1;
let SchedRW = [WriteSALU];
- let Defs = [SCC];
}
+// Get the offset of the base of the stack, skipping any reserved areas.
+def GET_STACK_BASE : SPseudoInstSI<(outs SGPR_32:$dst), (ins),
+ [(set p5:$dst, (sponentry))]> {
+ let Size = 16; // Worst case (s_getreg, s_cmp, s_cselect + constant).
+ let SchedRW = [WriteSALU];
+}
+} // End Defs = [SCC]
+
let Defs = [M0, EXEC, SCC],
UseNamedOperandTable = 1 in {
@@ -947,7 +980,11 @@ class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
+def SI_INDIRECT_SRC_V3 : SI_INDIRECT_SRC<VReg_96>;
def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
+def SI_INDIRECT_SRC_V5 : SI_INDIRECT_SRC<VReg_160>;
+def SI_INDIRECT_SRC_V6 : SI_INDIRECT_SRC<VReg_192>;
+def SI_INDIRECT_SRC_V7 : SI_INDIRECT_SRC<VReg_224>;
def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
def SI_INDIRECT_SRC_V9 : SI_INDIRECT_SRC<VReg_288>;
def SI_INDIRECT_SRC_V10 : SI_INDIRECT_SRC<VReg_320>;
@@ -958,7 +995,11 @@ def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>;
def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
+def SI_INDIRECT_DST_V3 : SI_INDIRECT_DST<VReg_96>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
+def SI_INDIRECT_DST_V5 : SI_INDIRECT_DST<VReg_160>;
+def SI_INDIRECT_DST_V6 : SI_INDIRECT_DST<VReg_192>;
+def SI_INDIRECT_DST_V7 : SI_INDIRECT_DST<VReg_224>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
def SI_INDIRECT_DST_V9 : SI_INDIRECT_DST<VReg_288>;
def SI_INDIRECT_DST_V10 : SI_INDIRECT_DST<VReg_320>;
@@ -1004,6 +1045,8 @@ def V_INDIRECT_REG_WRITE_MOVREL_B32_V2 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<
def V_INDIRECT_REG_WRITE_MOVREL_B32_V3 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_96>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V4 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_128>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V5 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_160>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V6 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_192>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V7 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_224>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V8 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_256>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V9 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_288>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V10 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_320>;
@@ -1017,6 +1060,8 @@ def S_INDIRECT_REG_WRITE_MOVREL_B32_V2 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<
def S_INDIRECT_REG_WRITE_MOVREL_B32_V3 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_96>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V4 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_128>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V5 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_160>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V6 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_192>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V7 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_224>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V8 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_256>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V9 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_288>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V10 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_320>;
@@ -1049,6 +1094,8 @@ def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VR
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_96>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_128>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_160>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_192>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_224>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_256>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_288>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_320>;
@@ -1069,6 +1116,8 @@ def V_INDIRECT_REG_READ_GPR_IDX_B32_V2 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg
def V_INDIRECT_REG_READ_GPR_IDX_B32_V3 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_96>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V4 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_128>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V5 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_160>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V6 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_192>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V7 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_224>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V8 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_256>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V9 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_288>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V10 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_320>;
@@ -1319,22 +1368,22 @@ multiclass f16_to_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_
>;
def : GCNPat <
- (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
+ (f32 (any_f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
(cvt_f32_f16_inst_e64 SRCMODS.ABS, $src0)
>;
def : GCNPat <
- (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
+ (f32 (any_f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
(cvt_f32_f16_inst_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)))
>;
def : GCNPat <
- (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
+ (f32 (any_f16_to_fp (or_oneuse i32:$src0, 0x8000))),
(cvt_f32_f16_inst_e64 SRCMODS.NEG_ABS, $src0)
>;
def : GCNPat <
- (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
+ (f32 (any_f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
(cvt_f32_f16_inst_e64 SRCMODS.NEG, $src0)
>;
@@ -1429,7 +1478,7 @@ def : GCNPat <
// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
-let SubtargetPredicate = HasFmaLegacy32 in
+let SubtargetPredicate = HasFmacLegacy32 in
def : GCNPat <
(f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0),
(VOP3NoMods f32:$src1),
@@ -2223,8 +2272,8 @@ def : GCNPat <
def : GCNPat <
(DivergentUnaryFrag<fneg> (v2f32 VReg_64:$src)),
- (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, VReg_64:$src,
- 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, (i64 0),
+ (V_PK_ADD_F32 !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), VReg_64:$src,
+ !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), (i64 0),
0, 0, 0, 0, 0)
> {
let SubtargetPredicate = HasPackedFP32Ops;
@@ -2238,12 +2287,34 @@ def : GCNPat <
>;
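+// fcopysign with a +0.0 magnitude only needs the sign bit of the sign
+// operand, so it reduces to a single AND with the sign-bit mask
+// (0x8000 for f16, 0x80000000 for f32).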
def : GCNPat <
+ (UniformBinFrag<fcopysign> (fp16vt fpimm_pos_zero), fp16vt:$src1),
+ (S_AND_B32 (S_MOV_B32 (i32 0x00008000)), SReg_32:$src1)
+>;
+
+def : GCNPat <
+ (fcopysign (fp16vt fpimm_pos_zero), fp16vt:$src1),
+ (V_AND_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src1)
+>;
+
+def : GCNPat <
(fcopysign f32:$src0, fp16vt:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
(V_LSHLREV_B32_e64 (i32 16), $src1))
>;
def : GCNPat <
+ (UniformBinFrag<fcopysign> (f32 fpimm_pos_zero), fp16vt:$src1),
+ (S_AND_B32 (S_MOV_B32 (i32 0x80000000)),
+ (S_LSHL_B32 SReg_32:$src1, (i32 16)))
+>;
+
+def : GCNPat <
+ (fcopysign (f32 fpimm_pos_zero), fp16vt:$src1),
+ (V_AND_B32_e32 (S_MOV_B32 (i32 0x80000000)),
+ (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1))
+>;
+
+def : GCNPat <
(fcopysign f64:$src0, fp16vt:$src1),
(REG_SEQUENCE SReg_64,
(i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
@@ -2258,6 +2329,18 @@ def : GCNPat <
>;
def : GCNPat <
+ (UniformBinFrag<fcopysign> (fp16vt fpimm_pos_zero), f32:$src1),
+ (S_AND_B32 (S_MOV_B32 (i32 0x00008000)),
+ (S_LSHR_B32 SReg_32:$src1, (i32 16)))
+>;
+
+def : GCNPat <
+ (fcopysign (fp16vt fpimm_pos_zero), f32:$src1),
+ (V_AND_B32_e32 (S_MOV_B32 (i32 0x00008000)),
+ (V_LSHRREV_B32_e64 (i32 16), VGPR_32:$src1))
+>;
+
+def : GCNPat <
(fcopysign fp16vt:$src0, f64:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
(V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
@@ -2271,12 +2354,27 @@ def : GCNPat <
(REG_SEQUENCE VGPR_32, $src1, lo16, (i16 (IMPLICIT_DEF)), hi16)), lo16)
>;
+// TODO: Scalar case for 0 magnitude special case
+def : GCNPat <
+ (fcopysign (fp16vt fpimm_pos_zero), fp16vt:$src1),
+ (V_AND_B16_t16_e64 0, (S_MOV_B32 (i32 0x00008000)), 0, VGPR_16:$src1)
+>;
+
+// TODO: Handle 0 magnitude special case
def : GCNPat <
(fcopysign f32:$src0, fp16vt:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
(REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16))
>;
+// TODO: Scalar case for 0 magnitude special case
+def : GCNPat <
+ (fcopysign (f32 fpimm_pos_zero), fp16vt:$src1),
+ (REG_SEQUENCE VGPR_32,
+ (V_MOV_B16_t16_e64 0, (i16 0), 0), lo16,
+ (V_AND_B16_t16_e64 0, (S_MOV_B32 (i32 0x00008000)), 0, VGPR_16:$src1), hi16)
+>;
+
def : GCNPat <
(fcopysign f64:$src0, fp16vt:$src1),
(REG_SEQUENCE VReg_64,
@@ -2292,6 +2390,13 @@ def : GCNPat <
(V_LSHRREV_B32_e64 (i32 16), $src1)), lo16)
>;
+// TODO: Scalar case for 0 magnitude special case
+def : GCNPat <
+ (fcopysign (fp16vt fpimm_pos_zero), f32:$src1),
+ (V_AND_B16_t16_e64 0, (S_MOV_B32 (i32 0x00008000)),
+ 0, (EXTRACT_SUBREG VGPR_32:$src1, hi16))
+>;
+
def : GCNPat <
(fcopysign fp16vt:$src0, f64:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
@@ -2309,6 +2414,16 @@ def : GCNPat <
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src0, $src1)
>;
+def : GCNPat <
+ (UniformBinFrag<fcopysign> build_vector_fpimm_pos_zero_v2<fp16vt>, fp16vt:$src1),
+ (S_AND_B32 (S_MOV_B32 (i32 0x80008000)), SReg_32:$src1)
+>;
+
+def : GCNPat <
+ (fcopysign build_vector_fpimm_pos_zero_v2<fp16vt>, fp16vt:$src1),
+ (V_AND_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src1)
+>;
+
}
/********** ================== **********/
@@ -2638,12 +2753,34 @@ def : AMDGPUPat <
>;
def : AMDGPUPat <
+ (UniformBinFrag<fcopysign> (f32 fpimm_pos_zero), (f32 SReg_32:$src1)),
+ (S_AND_B32 (S_MOV_B32 (i32 0x80000000)), $src1)
+>;
+
+def : AMDGPUPat <
+ (fcopysign (f32 fpimm_pos_zero), (f32 VGPR_32:$src1)),
+ (V_AND_B32_e32 (S_MOV_B32 (i32 0x80000000)), $src1)
+>;
+
+def : AMDGPUPat <
(fcopysign f32:$src0, f64:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
(i32 (EXTRACT_SUBREG SReg_64:$src1, sub1)))
>;
def : AMDGPUPat <
+ (UniformBinFrag<fcopysign> (f32 fpimm_pos_zero), SReg_64:$src1),
+ (S_AND_B32 (i32 (S_MOV_B32 (i32 0x80000000))),
+ (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1)))
+>;
+
+def : AMDGPUPat <
+ (fcopysign (f32 fpimm_pos_zero), VReg_64:$src1),
+ (V_AND_B32_e32 (i32 (S_MOV_B32 (i32 0x80000000))),
+ (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1)))
+>;
+
+def : AMDGPUPat <
(fcopysign f64:$src0, f64:$src1),
(REG_SEQUENCE SReg_64,
(i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
@@ -2663,8 +2800,6 @@ def : AMDGPUPat <
let True16Predicate = NotHasTrue16BitInsts in {
let SubtargetPredicate = isNotGFX9Plus in {
-def : ROTRPattern <V_ALIGNBIT_B32_e64>;
-
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
@@ -2675,14 +2810,6 @@ def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:
} // isNotGFX9Plus
let SubtargetPredicate = isGFX9GFX10 in {
-def : GCNPat <
- (rotr i32:$src0, i32:$src1),
- (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src0,
- /* src2_modifiers */ 0,
- $src1, /* clamp */ 0, /* op_sel */ 0)
->;
-
foreach pat = [(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in
def : GCNPat<pat,
@@ -2704,15 +2831,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
} // end True16Predicate = NotHasTrue16BitInsts
let True16Predicate = UseRealTrue16Insts in {
-def : GCNPat <
- (rotr i32:$src0, i32:$src1),
- (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src0,
- /* src2_modifiers */ 0,
- (EXTRACT_SUBREG $src1, lo16),
- /* clamp */ 0, /* op_sel */ 0)
->;
-
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
(V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
@@ -2731,14 +2849,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
} // end True16Predicate = UseRealTrue16Insts
let True16Predicate = UseFakeTrue16Insts in {
-def : GCNPat <
- (rotr i32:$src0, i32:$src1),
- (V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src0,
- /* src2_modifiers */ 0,
- $src1, /* clamp */ 0, /* op_sel */ 0)
->;
-
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
@@ -2784,7 +2894,11 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
}
defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
+defm : SI_INDIRECT_Pattern<v3f32, f32, "V3">;
defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
+defm : SI_INDIRECT_Pattern<v5f32, f32, "V5">;
+defm : SI_INDIRECT_Pattern<v6f32, f32, "V6">;
+defm : SI_INDIRECT_Pattern<v7f32, f32, "V7">;
defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
defm : SI_INDIRECT_Pattern <v9f32, f32, "V9">;
defm : SI_INDIRECT_Pattern <v10f32, f32, "V10">;
@@ -2794,7 +2908,11 @@ defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">;
defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
+defm : SI_INDIRECT_Pattern<v3i32, i32, "V3">;
defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
+defm : SI_INDIRECT_Pattern<v5i32, i32, "V5">;
+defm : SI_INDIRECT_Pattern<v6i32, i32, "V6">;
+defm : SI_INDIRECT_Pattern<v7i32, i32, "V7">;
defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
defm : SI_INDIRECT_Pattern <v9i32, i32, "V9">;
defm : SI_INDIRECT_Pattern <v10i32, i32, "V10">;
@@ -2930,15 +3048,25 @@ def : GCNPat <
>;
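+// Extensions of i32 to i64: the UniformUnaryFrag forms build SGPR sequences,
+// while the plain forms are reached for divergent values and use VGPRs.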
def : GCNPat <
- (i64 (zext i32:$src)),
+ (i64 (UniformUnaryFrag<zext> i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
>;
def : GCNPat <
- (i64 (anyext i32:$src)),
+ (i64 (zext i32:$src)),
+ (REG_SEQUENCE VReg_64, $src, sub0, (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+
+def : GCNPat <
+ (i64 (UniformUnaryFrag<anyext> i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
>;
+def : GCNPat <
+ (i64 (anyext i32:$src)),
+ (REG_SEQUENCE VReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
+>;
+
class ZExt_i64_i1_Pat <SDNode ext> : GCNPat <
(i64 (ext i1:$src)),
(REG_SEQUENCE VReg_64,
@@ -3459,10 +3587,7 @@ def : GCNPat<
// If fcanonicalize's operand is implicitly canonicalized, we only need a copy.
let AddedComplexity = 8 in {
foreach vt = [f16, v2f16, f32, v2f32, f64] in {
- def : GCNPat<
- (fcanonicalize (vt is_canonicalized:$src)),
- (COPY vt:$src)
- >;
+ def : GCNPat<(fcanonicalize_canonicalized vt:$src), (COPY vt:$src)>;
}
}
@@ -3481,30 +3606,6 @@ def : GCNPat<
>;
} // End True16Predicate
-let True16Predicate = UseRealTrue16Insts in {
-def : GCNPat<
- (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
- (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0)
->;
-
-def : GCNPat<
- (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
- (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_NEG_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0)
->;
-} // End True16Predicate
-
-let True16Predicate = UseFakeTrue16Insts in {
-def : GCNPat<
- (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
- (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
->;
-
-def : GCNPat<
- (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
- (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
->;
-} // End True16Predicate
-
def : GCNPat<
(fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
(V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
@@ -3663,8 +3764,6 @@ def : GCNPat <
SRCMODS.NONE, $src2)
>;
-// COPY is workaround tablegen bug from multiple outputs
-// from S_LSHL_B32's multiple outputs from implicit scc def.
let AddedComplexity = 1 in {
def : GCNPat <
(v2i16 (UniformBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))),
@@ -3683,7 +3782,7 @@ def : GCNPat <
>;
def : GCNPat <
- (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))),
+ (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 fpimm_pos_zero))),
(v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
>;
}
@@ -3694,7 +3793,7 @@ def : GCNPat <
>;
def : GCNPat <
- (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
+ (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 fpimm_pos_zero))),
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;
@@ -3879,9 +3978,6 @@ let SubtargetPredicate = isGFX9Plus in {
let True16Predicate = NotHasTrue16BitInsts in
def : PackB32Pat<V_PACK_B32_F16_e64>;
-let True16Predicate = UseRealTrue16Insts in
- def : PackB32Pat<V_PACK_B32_F16_t16_e64>;
-
let True16Predicate = UseFakeTrue16Insts in
def : PackB32Pat<V_PACK_B32_F16_fake16_e64>;
} // End SubtargetPredicate = isGFX9Plus
@@ -4551,6 +4647,7 @@ def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32 : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
@@ -4737,6 +4834,23 @@ def G_AMDGPU_READANYLANE : AMDGPUGenericInstruction {
let hasSideEffects = 0;
}
+// llvm.sponentry
+def G_AMDGPU_SPONENTRY : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins);
+ let hasSideEffects = 0;
+}
+
+class LoadMonitorInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins ptype1:$ptr);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+}
+
+def G_AMDGPU_FLAT_LOAD_MONITOR : LoadMonitorInstruction;
+def G_AMDGPU_GLOBAL_LOAD_MONITOR : LoadMonitorInstruction;
+
//============================================================================//
// Dummy Instructions
//============================================================================//
@@ -4749,3 +4863,14 @@ def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> {
let hasSideEffects = 1;
let SubtargetPredicate = isGFX10Plus;
}
+
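+// Remap pointer-typed operands on pseudo instructions to concrete VGPR
+// classes: LOAD_STACK_GUARD uses a 32-bit pointer, and every other pseudo
+// with pointer operands gets an aligned 64-bit VGPR pair.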
+defvar VGPR32_Ptr_Opcodes = [LOAD_STACK_GUARD];
+defvar VGPR64_Ptr_Opcodes = !listremove(PseudosWithPtrOps, VGPR32_Ptr_Opcodes);
+
+foreach inst = VGPR32_Ptr_Opcodes in {
+ def : RemapPointerOperands<inst, VGPR_32>;
+}
+
+foreach inst = VGPR64_Ptr_Opcodes in {
+ def : RemapPointerOperands<inst, VReg_64_AlignTarget>;
+}
diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
index 6537b79..83cf457 100644
--- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
@@ -17,6 +17,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/InitializePasses.h"
@@ -32,6 +33,7 @@ private:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineDominatorTree *MDT;
+ MachineLoopInfo *MLI;
const AMDGPU::LaneMaskConstants &LMC;
void expandChainCall(MachineInstr &MI, const GCNSubtarget &ST,
@@ -39,9 +41,10 @@ private:
void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock);
public:
- SILateBranchLowering(const GCNSubtarget &ST, MachineDominatorTree *MDT)
+ SILateBranchLowering(const GCNSubtarget &ST, MachineDominatorTree *MDT,
+ MachineLoopInfo *MLI)
: ST(ST), TII(ST.getInstrInfo()), TRI(&TII->getRegisterInfo()), MDT(MDT),
- LMC(AMDGPU::LaneMaskConstants::get(ST)) {}
+ MLI(MLI), LMC(AMDGPU::LaneMaskConstants::get(ST)) {}
bool run(MachineFunction &MF);
};
@@ -54,7 +57,9 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
auto *MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
- return SILateBranchLowering(ST, MDT).run(MF);
+ auto *MLIWP = getAnalysisIfAvailable<MachineLoopInfoWrapperPass>();
+ MachineLoopInfo *MLI = MLIWP ? &MLIWP->getLI() : nullptr;
+ return SILateBranchLowering(ST, MDT, MLI).run(MF);
}
StringRef getPassName() const override {
@@ -64,6 +69,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addPreserved<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineLoopInfoWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -117,7 +123,7 @@ static void generateEndPgm(MachineBasicBlock &MBB,
}
static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
- MachineDominatorTree *MDT) {
+ MachineDominatorTree *MDT, MachineLoopInfo *MLI) {
MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true);
// Update dominator tree
@@ -129,6 +135,12 @@ static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
}
DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
MDT->applyUpdates(DTUpdates);
+
+ // Update loop info if available
+ if (MLI) {
+ if (MachineLoop *Loop = MLI->getLoopFor(&MBB))
+ Loop->addBasicBlockToLoop(SplitBB, *MLI);
+ }
}
static void copyOpWithoutRegFlags(MachineInstrBuilder &MIB,
@@ -186,20 +198,20 @@ void SILateBranchLowering::expandChainCall(MachineInstr &MI,
for (int OpIdx = MI.getNumExplicitOperands() - 1; OpIdx >= ExecIdx; --OpIdx)
MI.removeOperand(OpIdx);
- MI.setDesc(TII->get(AMDGPU::SI_TCRETURN));
+ MI.setDesc(TII->get(AMDGPU::SI_TCRETURN_CHAIN));
}
void SILateBranchLowering::earlyTerm(MachineInstr &MI,
MachineBasicBlock *EarlyExitBlock) {
MachineBasicBlock &MBB = *MI.getParent();
- const DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
.addMBB(EarlyExitBlock);
auto Next = std::next(MI.getIterator());
if (Next != MBB.end() && !Next->isTerminator())
- splitBlock(MBB, *BranchMI, MDT);
+ splitBlock(MBB, *BranchMI, MDT, MLI);
MBB.addSuccessor(EarlyExitBlock);
MDT->insertEdge(&MBB, EarlyExitBlock);
@@ -210,11 +222,14 @@ llvm::SILateBranchLoweringPass::run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
auto *MDT = &MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
- if (!SILateBranchLowering(ST, MDT).run(MF))
+ auto *MLI = MFAM.getCachedResult<MachineLoopAnalysis>(MF);
+ if (!SILateBranchLowering(ST, MDT, MLI).run(MF))
return PreservedAnalyses::all();
- return getMachineFunctionPassPreservedAnalyses()
- .preserve<MachineDominatorTreeAnalysis>();
+ auto PA = getMachineFunctionPassPreservedAnalyses();
+ PA.preserve<MachineDominatorTreeAnalysis>();
+ PA.preserve<MachineLoopAnalysis>();
+ return PA;
}
bool SILateBranchLowering::run(MachineFunction &MF) {
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index f0d1117..0141c36 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -193,6 +193,8 @@ class SILoadStoreOptimizer {
unsigned LoSubReg = 0;
unsigned HiSubReg = 0;
+  // True when the base address was matched from a V_ADD_U64_e64 pattern
+  // (gfx1250+).
+  bool UseV64Pattern = false;
};
struct MemAddress {
@@ -233,10 +235,11 @@ private:
void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore,
- AMDGPU::OpName OpName, Register DestReg) const;
+ const DebugLoc &DL, AMDGPU::OpName OpName,
+ Register DestReg) const;
Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore,
- AMDGPU::OpName OpName) const;
+ const DebugLoc &DL, AMDGPU::OpName OpName) const;
unsigned read2Opcode(unsigned EltSize) const;
unsigned read2ST64Opcode(unsigned EltSize) const;
@@ -278,9 +281,12 @@ private:
void updateBaseAndOffset(MachineInstr &I, Register NewBase,
int32_t NewOffset) const;
+ void updateAsyncLDSAddress(MachineInstr &MI, int32_t OffsetDiff) const;
Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
- std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
+ bool processBaseWithConstOffset64(MachineInstr *AddDef,
+ const MachineOperand &Base,
+ MemAddress &Addr) const;
void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
/// Promotes constant offset to the immediate by adjusting the base. It
/// tries to use a base from the nearby instructions that allows it to have
@@ -1336,11 +1342,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
AMDGPU::OpName::data1);
- const TargetRegisterClass *DataRC0 =
- TII->getRegClass(Write2Opc, Data0Idx, TRI);
+ const TargetRegisterClass *DataRC0 = TII->getRegClass(Write2Opc, Data0Idx);
- const TargetRegisterClass *DataRC1 =
- TII->getRegClass(Write2Opc, Data1Idx, TRI);
+ const TargetRegisterClass *DataRC1 = TII->getRegClass(Write2Opc, Data1Idx);
if (unsigned SubReg = Data0->getSubReg()) {
DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()),
@@ -1367,10 +1371,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
// Paired.
void SILoadStoreOptimizer::copyToDestRegs(
CombineInfo &CI, CombineInfo &Paired,
- MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName,
- Register DestReg) const {
+ MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL,
+ AMDGPU::OpName OpName, Register DestReg) const {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
@@ -1387,7 +1390,7 @@ void SILoadStoreOptimizer::copyToDestRegs(
BuildMI(*MBB, InsertBefore, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
- .addReg(DestReg, 0, SubRegIdx0);
+ .addReg(DestReg, {}, SubRegIdx0);
BuildMI(*MBB, InsertBefore, DL, CopyDesc)
.add(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
@@ -1398,9 +1401,9 @@ void SILoadStoreOptimizer::copyToDestRegs(
Register
SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL,
AMDGPU::OpName OpName) const {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
@@ -1456,11 +1459,12 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
Register DestReg = MRI->createVirtualRegister(SuperRC);
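+  // The merged instruction replaces both originals, so derive its debug
+  // location from the two source locations rather than just the first.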
- DebugLoc DL = CI.I->getDebugLoc();
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
Register BaseReg = AddrReg->getReg();
unsigned BaseSubReg = AddrReg->getSubReg();
- unsigned BaseRegFlags = 0;
+ RegState BaseRegFlags = {};
if (CI.BaseOff) {
Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
@@ -1471,7 +1475,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
.addReg(ImmReg)
- .addReg(AddrReg->getReg(), 0, BaseSubReg)
+ .addReg(AddrReg->getReg(), {}, BaseSubReg)
.addImm(0); // clamp bit
BaseSubReg = 0;
}
@@ -1484,7 +1488,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
.addImm(0) // gds
.cloneMergedMemRefs({&*CI.I, &*Paired.I});
- copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
+ copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1541,11 +1545,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
(NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
const MCInstrDesc &Write2Desc = TII->get(Opc);
- DebugLoc DL = CI.I->getDebugLoc();
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
Register BaseReg = AddrReg->getReg();
unsigned BaseSubReg = AddrReg->getSubReg();
- unsigned BaseRegFlags = 0;
+ RegState BaseRegFlags = {};
if (CI.BaseOff) {
Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
@@ -1556,7 +1561,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
.addReg(ImmReg)
- .addReg(AddrReg->getReg(), 0, BaseSubReg)
+ .addReg(AddrReg->getReg(), {}, BaseSubReg)
.addImm(0); // clamp bit
BaseSubReg = 0;
}
@@ -1582,7 +1587,9 @@ MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
+
const unsigned Opcode = getNewOpcode(CI, Paired);
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
@@ -1607,7 +1614,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
- copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
+ copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1618,7 +1625,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
+
const unsigned Opcode = getNewOpcode(CI, Paired);
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
@@ -1639,7 +1648,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
New.addImm(MergedOffset);
New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
- copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
+ copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::sdst, DestReg);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1650,7 +1659,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
+
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1680,7 +1691,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
.addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
- copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
+ copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1691,7 +1702,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
+
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1731,7 +1744,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
.addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
- copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
+ copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1742,12 +1755,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
const unsigned Opcode = getNewOpcode(CI, Paired);
Register SrcReg =
- copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
+ copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
@@ -1789,7 +1803,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
+
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1807,7 +1823,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
.addImm(CI.CPol)
.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
- copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
+ copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1818,12 +1834,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
+
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
const unsigned Opcode = getNewOpcode(CI, Paired);
Register SrcReg =
- copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
+ copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
@@ -2094,12 +2112,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
- DebugLoc DL = CI.I->getDebugLoc();
+ DebugLoc DL =
+ DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
const unsigned Opcode = getNewOpcode(CI, Paired);
Register SrcReg =
- copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
+ copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
@@ -2149,8 +2168,35 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
const MemAddress &Addr) const {
MachineBasicBlock *MBB = MI.getParent();
MachineBasicBlock::iterator MBBI = MI.getIterator();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
+ // Use V_ADD_U64_e64 when the original pattern used it (gfx1250+)
+ if (Addr.Base.UseV64Pattern) {
+ Register FullDestReg = MRI->createVirtualRegister(
+ TII->getRegClass(TII->get(AMDGPU::V_ADD_U64_e64), 0));
+
+    // Materialize the 64-bit offset in an SGPR pair.
+ Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ MachineInstr *MovOffset =
+ BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO),
+ OffsetReg)
+ .addImm(Addr.Offset);
+ MachineInstr *Add64 =
+ BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_U64_e64), FullDestReg)
+ .addReg(Addr.Base.LoReg)
+ .addReg(OffsetReg, RegState::Kill)
+ .addImm(0);
+ (void)MovOffset;
+ (void)Add64;
+ LLVM_DEBUG(dbgs() << " " << *MovOffset << "\n";
+ dbgs() << " " << *Add64 << "\n\n";);
+
+ return FullDestReg;
+ }
+
+ // Original carry-chain pattern (V_ADD_CO_U32 + V_ADDC_U32)
assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
Addr.Base.LoSubReg) &&
"Expected 32-bit Base-Register-Low!!");
@@ -2159,7 +2205,6 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
Addr.Base.HiSubReg) &&
"Expected 32-bit Base-Register-Hi!!");
- LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
MachineOperand OffsetHi =
createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
@@ -2171,23 +2216,19 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MachineInstr *LoHalf =
- BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
- .addReg(CarryReg, RegState::Define)
- .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
- .add(OffsetLo)
- .addImm(0); // clamp bit
- (void)LoHalf;
- LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
+ BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
+ .addReg(CarryReg, RegState::Define)
+ .addReg(Addr.Base.LoReg, {}, Addr.Base.LoSubReg)
+ .add(OffsetLo)
+ .addImm(0); // clamp bit
MachineInstr *HiHalf =
- BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
- .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
- .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
- .add(OffsetHi)
- .addReg(CarryReg, RegState::Kill)
- .addImm(0); // clamp bit
- (void)HiHalf;
- LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
+ BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
+ .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
+ .addReg(Addr.Base.HiReg, {}, Addr.Base.HiSubReg)
+ .add(OffsetHi)
+ .addReg(CarryReg, RegState::Kill)
+ .addImm(0); // clamp bit
Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
MachineInstr *FullBase =
@@ -2196,8 +2237,13 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
.addImm(AMDGPU::sub0)
.addReg(DestSub1)
.addImm(AMDGPU::sub1);
+
+ (void)LoHalf;
+ (void)HiHalf;
(void)FullBase;
- LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
+ LLVM_DEBUG(dbgs() << " " << *LoHalf << "\n";
+ dbgs() << " " << *HiHalf << "\n";
+ dbgs() << " " << *FullBase << "\n\n";);
return FullDestReg;
}
@@ -2212,20 +2258,33 @@ void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}
-std::optional<int32_t>
-SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
- if (Op.isImm())
- return Op.getImm();
+// Helper to extract a 64-bit constant offset from a V_ADD_U64_e64 instruction.
+// Returns true if successful, populating Addr with base register info and
+// offset.
+bool SILoadStoreOptimizer::processBaseWithConstOffset64(
+ MachineInstr *AddDef, const MachineOperand &Base, MemAddress &Addr) const {
+ if (!Base.isReg())
+ return false;
+
+ MachineOperand *Src0 = TII->getNamedOperand(*AddDef, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(*AddDef, AMDGPU::OpName::src1);
- if (!Op.isReg())
- return std::nullopt;
+ const MachineOperand *BaseOp = nullptr;
- MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
- if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
- !Def->getOperand(1).isImm())
- return std::nullopt;
+ auto Offset = TII->getImmOrMaterializedImm(*Src1);
- return Def->getOperand(1).getImm();
+ if (Offset) {
+ BaseOp = Src0;
+ Addr.Offset = *Offset;
+ } else {
+    // Src1 is not a constant offset, so this is not a base-plus-immediate add.
+ return false;
+ }
+
+ // Now extract the base register (which should be a 64-bit VGPR).
+ Addr.Base.LoReg = BaseOp->getReg();
+ Addr.Base.UseV64Pattern = true;
+ return true;
}
// Analyze Base and extracts:
@@ -2238,14 +2297,27 @@ SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
// %Base:vreg_64 =
// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
+//
+// Also handles V_ADD_U64_e64 pattern (gfx1250+):
+// %OFFSET:sreg_64 = S_MOV_B64_IMM_PSEUDO 256
+// %Base:vreg_64 = V_ADD_U64_e64 %BASE:vreg_64, %OFFSET:sreg_64, 0
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
MemAddress &Addr) const {
if (!Base.isReg())
return;
MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
- if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
- || Def->getNumOperands() != 5)
+ if (!Def)
+ return;
+
+ // Try V_ADD_U64_e64 pattern first (simpler, used on gfx1250+)
+ if (Def->getOpcode() == AMDGPU::V_ADD_U64_e64) {
+ if (processBaseWithConstOffset64(Def, Base, Addr))
+ return;
+ }
+
+ // Fall through to REG_SEQUENCE + V_ADD_CO_U32 + V_ADDC_U32 pattern
+ if (Def->getOpcode() != AMDGPU::REG_SEQUENCE || Def->getNumOperands() != 5)
return;
MachineOperand BaseLo = Def->getOperand(1);
@@ -2260,14 +2332,14 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
!BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
return;
- const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
- const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
+ MachineOperand *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
- auto Offset0P = extractConstOffset(*Src0);
+ auto Offset0P = TII->getImmOrMaterializedImm(*Src0);
if (Offset0P)
BaseLo = *Src1;
else {
- if (!(Offset0P = extractConstOffset(*Src1)))
+ if (!(Offset0P = TII->getImmOrMaterializedImm(*Src1)))
return;
BaseLo = *Src0;
}
@@ -2297,6 +2369,32 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}
+// Maintain the correct LDS address for async loads and stores.
+// The LDS address becomes incorrect when promoteConstantOffsetToImm folds in
+// an offset that is only meant for the global address operand. For async
+// loads the LDS address is in vdst; for async stores it is in vdata.
+void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &MI,
+ int32_t OffsetDiff) const {
+ if (!TII->usesASYNC_CNT(MI) || OffsetDiff == 0)
+ return;
+
+ MachineOperand *LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ if (!LDSAddr)
+ LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+ assert(LDSAddr);
+
+ Register OldReg = LDSAddr->getReg();
+ Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(OldReg));
+ MachineBasicBlock &MBB = *MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
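+  // Subtract the promoted offset back out so the LDS address itself does not
+  // move with the rebased global address.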
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), NewReg)
+ .addReg(OldReg)
+ .addImm(-OffsetDiff)
+ .addImm(0);
+
+ LDSAddr->setReg(NewReg);
+}
+
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
MachineInstr &MI,
MemInfoMap &Visited,
@@ -2426,7 +2524,9 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
// Instead of moving up, just re-compute anchor-instruction's base address.
Register Base = computeBase(MI, AnchorAddr);
- updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
+ int32_t OffsetDiff = MAddr.Offset - AnchorAddr.Offset;
+ updateBaseAndOffset(MI, Base, OffsetDiff);
+ updateAsyncLDSAddress(MI, OffsetDiff);
LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
@@ -2437,7 +2537,9 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
if (TLI->isLegalFlatAddressingMode(AM, AS)) {
LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")";
OtherMI->dump());
- updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
+ int32_t OtherOffsetDiff = OtherOffset - AnchorAddr.Offset;
+ updateBaseAndOffset(*OtherMI, Base, OtherOffsetDiff);
+ updateAsyncLDSAddress(*OtherMI, OtherOffsetDiff);
LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump());
}
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 8586d6c..9cc86e8 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -56,6 +56,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachinePostDominators.h"
@@ -160,6 +161,7 @@ public:
AU.addPreserved<SlotIndexesWrapperPass>();
AU.addPreserved<LiveIntervalsWrapperPass>();
AU.addPreserved<LiveVariablesWrapperPass>();
+ AU.addPreserved<MachineBlockFrequencyInfoWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -742,6 +744,11 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
if (PDT)
PDT->applyUpdates(DTUpdates);
+ if (MDT && MDT->getNode(&MBB))
+ MDT->eraseNode(&MBB);
+ if (PDT && PDT->getNode(&MBB))
+ PDT->eraseNode(&MBB);
+
MBB.clear();
MBB.eraseFromParent();
if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) {
@@ -880,5 +887,6 @@ SILowerControlFlowPass::run(MachineFunction &MF,
PA.preserve<SlotIndexesAnalysis>();
PA.preserve<LiveIntervalsAnalysis>();
PA.preserve<LiveVariablesAnalysis>();
+ PA.preserve<MachineBlockFrequencyAnalysis>();
return PA;
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 96131bd..0b8c71a 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -75,7 +75,7 @@ Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF,
bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) {
assert(Changed || ConstrainRegs.empty());
for (Register Reg : ConstrainRegs)
- MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
+ MRI->constrainRegClass(Reg, TII->getRegisterInfo().getWaveMaskRegClass());
ConstrainRegs.clear();
return Changed;
@@ -417,7 +417,7 @@ bool Vreg1LoweringHelper::lowerCopiesFromI1() {
// Copy into a 32-bit vector register.
LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
assert(!MI.getOperand(0).getSubReg());
@@ -616,7 +616,7 @@ bool Vreg1LoweringHelper::lowerCopiesToI1() {
if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
continue;
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
Register SrcReg = MI.getOperand(1).getReg();
assert(!MI.getOperand(1).getSubReg());
@@ -881,18 +881,14 @@ SILowerI1CopiesPass::run(MachineFunction &MF,
return PreservedAnalyses::all();
// TODO: Probably preserves most.
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
+ return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
}
class SILowerI1CopiesLegacy : public MachineFunctionPass {
public:
static char ID;
- SILowerI1CopiesLegacy() : MachineFunctionPass(ID) {
- initializeSILowerI1CopiesLegacyPass(*PassRegistry::getPassRegistry());
- }
+ SILowerI1CopiesLegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 40eeeb8..cbd08f0 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -117,27 +117,26 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
MachineFunction &MF = *SaveBlock.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
- const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *RI = ST.getRegisterInfo();
MachineBasicBlock::iterator I = SaveBlock.begin();
- if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
+ if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, RI)) {
for (const CalleeSavedInfo &CS : CSI) {
// Insert the spill to the stack frame.
MCRegister Reg = CS.getReg();
MachineInstrSpan MIS(I, &SaveBlock);
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(
+ const TargetRegisterClass *RC = RI->getMinimalPhysRegClass(
Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32);
// If this value was already livein, we probably have a direct use of the
// incoming register value, so don't kill at the spill point. This happens
// since we pass some special inputs (workgroup IDs) in the callee saved
// range.
- const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, TRI);
+ const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, RI);
TII.storeRegToStackSlot(SaveBlock, I, Reg, !IsLiveIn, CS.getFrameIdx(),
- RC, TRI, Register());
+ RC, Register());
if (Indexes) {
assert(std::distance(MIS.begin(), I) == 1);
diff --git a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
index ef384c2..4aa4186 100644
--- a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
@@ -53,9 +53,7 @@ class SILowerWWMCopiesLegacy : public MachineFunctionPass {
public:
static char ID;
- SILowerWWMCopiesLegacy() : MachineFunctionPass(ID) {
- initializeSILowerWWMCopiesLegacyPass(*PassRegistry::getPassRegistry());
- }
+ SILowerWWMCopiesLegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index b398db4..af3226d 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -37,7 +37,7 @@ static cl::opt<bool, true> MFMAVGPRFormOpt(
"amdgpu-mfma-vgpr-form",
cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
"unspecified, default to compiler heuristics"),
- cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(false),
+ cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(true),
cl::Hidden);
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
@@ -114,7 +114,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
FrameOffsetReg = AMDGPU::SGPR33;
StackPtrOffsetReg = AMDGPU::SGPR32;
- if (!ST.enableFlatScratch()) {
+ if (!ST.hasFlatScratchEnabled()) {
// Non-entry functions have no special inputs for now, other registers
// required for scratch access.
ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
@@ -169,7 +169,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
if (WorkItemIDZ)
WorkItemIDY = true;
- if (!ST.flatScratchIsArchitected()) {
+ if (!ST.hasArchitectedFlatScratch()) {
PrivateSegmentWaveByteOffset = true;
// HS and GS always have the scratch wave offset in SGPR5 on GFX9.
@@ -692,11 +692,10 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
if (Arg.isMasked())
SA.Mask = Arg.getMask();
- A = SA;
+ A = std::move(SA);
return true;
};
- // TODO: Need to serialize kernarg preloads.
bool Any = false;
Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
@@ -718,6 +717,21 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);
+ // Write FirstKernArgPreloadReg separately, since it's a Register,
+  // not an ArgDescriptor.
+ if (ArgInfo.FirstKernArgPreloadReg) {
+ Register Reg = ArgInfo.FirstKernArgPreloadReg;
+ assert(Reg.isPhysical() &&
+ "FirstKernArgPreloadReg must be a physical register");
+
+ yaml::SIArgument SA = yaml::SIArgument::createArgument(true);
+ raw_string_ostream OS(SA.RegisterName.Value);
+ OS << printReg(Reg, &TRI);
+
+ AI.FirstKernArgPreloadReg = SA;
+ Any = true;
+ }
+
if (Any)
return AI;
@@ -730,9 +744,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
: ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
- IsEntryFunction(MFI.isEntryFunction()),
- NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
- MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
+ IsEntryFunction(MFI.isEntryFunction()), MemoryBound(MFI.isMemoryBound()),
+ WaveLimiter(MFI.needsWaveLimiter()),
HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
@@ -750,7 +763,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
IsWholeWaveFunction(MFI.isWholeWaveFunction()),
DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
- ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
+ ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()),
+ NumKernargPreloadSGPRs(MFI.getNumKernargPreloadedSGPRs()) {
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
SpillPhysVGPRS.push_back(regToString(Reg, TRI));
@@ -788,7 +802,6 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
Occupancy = YamlMFI.Occupancy;
IsEntryFunction = YamlMFI.IsEntryFunction;
- NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
MemoryBound = YamlMFI.MemoryBound;
WaveLimiter = YamlMFI.WaveLimiter;
HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
@@ -799,6 +812,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
ReturnsVoid = YamlMFI.ReturnsVoid;
IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
+ UserSGPRInfo.allocKernargPreloadSGPRs(YamlMFI.NumKernargPreloadSGPRs);
+
if (YamlMFI.ScavengeFI) {
auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
if (!FIOrErr) {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 2c1a13c..617862d 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -170,6 +170,7 @@ struct SIArgumentInfo {
std::optional<SIArgument> DispatchID;
std::optional<SIArgument> FlatScratchInit;
std::optional<SIArgument> PrivateSegmentSize;
+ std::optional<SIArgument> FirstKernArgPreloadReg;
std::optional<SIArgument> WorkGroupIDX;
std::optional<SIArgument> WorkGroupIDY;
@@ -195,6 +196,7 @@ template <> struct MappingTraits<SIArgumentInfo> {
YamlIO.mapOptional("dispatchID", AI.DispatchID);
YamlIO.mapOptional("flatScratchInit", AI.FlatScratchInit);
YamlIO.mapOptional("privateSegmentSize", AI.PrivateSegmentSize);
+ YamlIO.mapOptional("firstKernArgPreloadReg", AI.FirstKernArgPreloadReg);
YamlIO.mapOptional("workGroupIDX", AI.WorkGroupIDX);
YamlIO.mapOptional("workGroupIDY", AI.WorkGroupIDY);
@@ -265,7 +267,6 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
Align DynLDSAlign;
bool IsEntryFunction = false;
bool IsChainFunction = false;
- bool NoSignedZerosFPMath = false;
bool MemoryBound = false;
bool WaveLimiter = false;
bool HasSpilledSGPRs = false;
@@ -305,13 +306,15 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
unsigned DynamicVGPRBlockSize = 0;
unsigned ScratchReservedForDynamicVGPRs = 0;
+ unsigned NumKernargPreloadSGPRs = 0;
+
SIMachineFunctionInfo() = default;
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
const TargetRegisterInfo &TRI,
const llvm::MachineFunction &MF);
void mappingImpl(yaml::IO &YamlIO) override;
- ~SIMachineFunctionInfo() = default;
+ ~SIMachineFunctionInfo() override = default;
};
template <> struct MappingTraits<SIMachineFunctionInfo> {
@@ -324,7 +327,6 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("dynLDSAlign", MFI.DynLDSAlign, Align());
YamlIO.mapOptional("isEntryFunction", MFI.IsEntryFunction, false);
YamlIO.mapOptional("isChainFunction", MFI.IsChainFunction, false);
- YamlIO.mapOptional("noSignedZerosFPMath", MFI.NoSignedZerosFPMath, false);
YamlIO.mapOptional("memoryBound", MFI.MemoryBound, false);
YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false);
@@ -361,6 +363,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false);
YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
MFI.ScratchReservedForDynamicVGPRs, 0);
+ YamlIO.mapOptional("numKernargPreloadSGPRs", MFI.NumKernargPreloadSGPRs, 0);
YamlIO.mapOptional("isWholeWaveFunction", MFI.IsWholeWaveFunction, false);
}
};
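For orientation, a minimal standalone model (invented names, not LLVM's YAMLTraits) of the mapOptional semantics the new numKernargPreloadSGPRs key relies on: a missing key yields the supplied default, and a value equal to the default is omitted when printing.

    #include <iostream>
    #include <map>
    #include <string>

    // Model of mapOptional's read side: take Key from Doc, else Default.
    static void mapOptionalModel(const std::map<std::string, unsigned> &Doc,
                                 const std::string &Key, unsigned &Field,
                                 unsigned Default) {
      auto It = Doc.find(Key);
      Field = (It == Doc.end()) ? Default : It->second;
    }

    int main() {
      std::map<std::string, unsigned> Doc; // MIR body without the new key
      unsigned NumKernargPreloadSGPRs = 42;
      mapOptionalModel(Doc, "numKernargPreloadSGPRs", NumKernargPreloadSGPRs, 0);
      std::cout << NumKernargPreloadSGPRs << '\n'; // 0: key absent, default used
    }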
@@ -1014,7 +1017,9 @@ public:
void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; }
Register getPrivateSegmentWaveByteOffsetSystemSGPR() const {
- return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
+ if (ArgInfo.PrivateSegmentWaveByteOffset)
+ return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
+ return MCRegister();
}
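The guard above returns an invalid register instead of asserting when the argument was never allocated. A standalone sketch of the pattern (model types, not the LLVM ones):

    #include <cassert>
    #include <optional>

    struct MCRegisterModel {           // stand-in for an MCRegister-like type
      unsigned Id = 0;                 // 0 encodes "no register"
      explicit operator bool() const { return Id != 0; }
    };

    // Guarded accessor: hand back the invalid register rather than crashing
    // when the optional argument was never set.
    static MCRegisterModel getWaveByteOffset(
        const std::optional<MCRegisterModel> &Arg) {
      if (Arg)
        return *Arg;
      return MCRegisterModel(); // invalid; caller must test before use
    }

    int main() {
      assert(!getWaveByteOffset(std::nullopt)); // no assert-fail, just invalid
    }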
/// Returns the physical register reserved for use as the resource
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
index fd28abe..fb0c7e6 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -323,8 +323,8 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
// Do not track physical registers, because doing so confuses the tracking.
for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
- if (RegMaskPair.RegUnit.isVirtual())
- LiveInRegs.insert(RegMaskPair.RegUnit);
+ if (RegMaskPair.VRegOrUnit.isVirtualReg())
+ LiveInRegs.insert(RegMaskPair.VRegOrUnit.asVirtualReg());
}
LiveOutRegs.clear();
// There are several possibilities to distinguish:
@@ -350,12 +350,13 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
// Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
// The use of findDefBetween removes the case 4.
for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
- Register Reg = RegMaskPair.RegUnit;
- if (Reg.isVirtual() &&
- isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
+ VirtRegOrUnit VRegOrUnit = RegMaskPair.VRegOrUnit;
+ if (VRegOrUnit.isVirtualReg() &&
+ isDefBetween(VRegOrUnit.asVirtualReg(),
+ LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
LIS)) {
- LiveOutRegs.insert(Reg);
+ LiveOutRegs.insert(VRegOrUnit.asVirtualReg());
}
}
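A standalone model of the VirtRegOrUnit migration in these hunks (invented class, not the LLVM one): tagging the value makes mixing virtual registers with physical register units a checked error instead of a silent one.

    #include <cassert>

    class VirtRegOrUnitModel {
      bool IsVirtual;
      unsigned Value;
      VirtRegOrUnitModel(bool V, unsigned X) : IsVirtual(V), Value(X) {}

    public:
      static VirtRegOrUnitModel virtReg(unsigned V) { return {true, V}; }
      static VirtRegOrUnitModel regUnit(unsigned U) { return {false, U}; }
      bool isVirtualReg() const { return IsVirtual; }
      unsigned asVirtualReg() const {
        assert(IsVirtual && "not a virtual register");
        return Value;
      }
    };

    int main() {
      auto R = VirtRegOrUnitModel::virtReg(42);
      if (R.isVirtualReg())             // filter exactly as the loops above do
        assert(R.asVirtualReg() == 42);
    }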
@@ -578,11 +579,11 @@ void SIScheduleBlock::printDebug(bool full) {
<< LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
dbgs() << "LiveIns:\n";
for (Register Reg : LiveInRegs)
- dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+ dbgs() << printReg(Reg, DAG->getTRI()) << ' ';
dbgs() << "\nLiveOuts:\n";
for (Register Reg : LiveOutRegs)
- dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+ dbgs() << printReg(Reg, DAG->getTRI()) << ' ';
}
dbgs() << "\nInstructions:\n";
@@ -921,7 +922,7 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
// combination of children.
PendingColoring[SU->NodeNum] = NextNonReservedID++;
}
- CurrentColoring = PendingColoring;
+ CurrentColoring = std::move(PendingColoring);
}
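The std::move above matters because PendingColoring holds one entry per node; a short standalone sketch of the difference:

    #include <cassert>
    #include <utility>
    #include <vector>

    int main() {
      std::vector<int> Pending(1 << 20, 7); // large per-node vector
      std::vector<int> Current;
      Current = std::move(Pending); // steals the buffer; no element-wise copy
      assert(Current.size() == (1u << 20));
      // Pending is left valid but unspecified (typically empty); it is not
      // read again afterwards, which is what makes the move safe here.
    }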
@@ -1446,23 +1447,24 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
}
#endif
- std::set<Register> InRegs = DAG->getInRegs();
+ std::set<VirtRegOrUnit> InRegs = DAG->getInRegs();
addLiveRegs(InRegs);
// Increase LiveOutRegsNumUsages for blocks
// producing registers consumed in another
// scheduling region.
- for (Register Reg : DAG->getOutRegs()) {
+ for (VirtRegOrUnit VRegOrUnit : DAG->getOutRegs()) {
for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
// Do reverse traversal
int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
SIScheduleBlock *Block = Blocks[ID];
const std::set<Register> &OutRegs = Block->getOutRegs();
- if (OutRegs.find(Reg) == OutRegs.end())
+ if (!VRegOrUnit.isVirtualReg() ||
+ OutRegs.find(VRegOrUnit.asVirtualReg()) == OutRegs.end())
continue;
- ++LiveOutRegsNumUsages[ID][Reg];
+ ++LiveOutRegsNumUsages[ID][VRegOrUnit.asVirtualReg()];
break;
}
}
@@ -1565,15 +1567,18 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
maxVregUsage = VregCurrentUsage;
if (SregCurrentUsage > maxSregUsage)
maxSregUsage = SregCurrentUsage;
- LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
- for (SIScheduleBlock *Block : ReadyBlocks)
- dbgs() << Block->getID() << ' ';
- dbgs() << "\nCurrent Live:\n";
- for (Register Reg : LiveRegs)
- dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
- dbgs() << '\n';
- dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
- dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);
+ LLVM_DEBUG({
+ dbgs() << "Picking New Blocks\n";
+ dbgs() << "Available: ";
+ for (SIScheduleBlock *Block : ReadyBlocks)
+ dbgs() << Block->getID() << ' ';
+ dbgs() << "\nCurrent Live:\n";
+ for (Register Reg : LiveRegs)
+ dbgs() << printReg(Reg, DAG->getTRI()) << ' ';
+ dbgs() << '\n';
+ dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
+ dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
+ });
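The braced LLVM_DEBUG form used above passes the whole block as one macro argument; a standalone model (hand-rolled macro, not llvm/Support/Debug.h) of why that shape is convenient:

    #include <iostream>

    // Model of the braced debug-block idiom: the block compiles away entirely
    // when NDEBUG is set, and formats like ordinary code, unlike a chain of
    // expressions glued together with semicolons inside one macro call.
    #ifndef NDEBUG
    #define DEBUG_BLOCK(X) do X while (false)
    #else
    #define DEBUG_BLOCK(X) do { } while (false)
    #endif

    int main() {
      int NumBlocks = 3;
      DEBUG_BLOCK({
        std::cout << "Available:";
        for (int I = 0; I < NumBlocks; ++I)
          std::cout << ' ' << I;
        std::cout << '\n';
      });
    }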
Cand.Block = nullptr;
for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
@@ -1625,13 +1630,13 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
// Tracking of currently alive registers to determine VGPR Usage.
-void SIScheduleBlockScheduler::addLiveRegs(std::set<Register> &Regs) {
- for (Register Reg : Regs) {
+void SIScheduleBlockScheduler::addLiveRegs(std::set<VirtRegOrUnit> &Regs) {
+ for (VirtRegOrUnit VRegOrUnit : Regs) {
// For now only track virtual registers.
- if (!Reg.isVirtual())
+ if (!VRegOrUnit.isVirtualReg())
continue;
// If not already in the live set, then add it.
- (void) LiveRegs.insert(Reg);
+ (void)LiveRegs.insert(VRegOrUnit.asVirtualReg());
}
}
@@ -1662,7 +1667,7 @@ void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
decreaseLiveRegs(Block, Block->getInRegs());
- addLiveRegs(Block->getOutRegs());
+ LiveRegs.insert(Block->getOutRegs().begin(), Block->getOutRegs().end());
releaseBlockSuccs(Block);
for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) {
// We produce this register, thus it must not be previously alive.
@@ -1689,7 +1694,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs,
continue;
if (LiveRegsConsumers[Reg] > 1)
continue;
- PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+ PSetIterator PSetI = DAG->getMRI()->getPressureSets(VirtRegOrUnit(Reg));
for (; PSetI.isValid(); ++PSetI) {
DiffSetPressure[*PSetI] -= PSetI.getWeight();
}
@@ -1699,7 +1704,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs,
// For now only track virtual registers.
if (!Reg.isVirtual())
continue;
- PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+ PSetIterator PSetI = DAG->getMRI()->getPressureSets(VirtRegOrUnit(Reg));
for (; PSetI.isValid(); ++PSetI) {
DiffSetPressure[*PSetI] += PSetI.getWeight();
}
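These two hunks keep the same pressure accounting, just through the new wrapper: releasing the last use of a live register subtracts its weight from every pressure set it belongs to, and defining a new one adds it. A standalone model:

    #include <cstdio>

    // Two pressure sets (say, SGPR and VGPR); each register contributes its
    // weight to every set it belongs to.
    struct PressureDeltaModel { int Diff[2] = {0, 0}; };

    static void consumeLastUse(PressureDeltaModel &P, int Weight) {
      for (int &D : P.Diff)
        D -= Weight; // register dies: pressure drops
    }
    static void produceDef(PressureDeltaModel &P, int Weight) {
      for (int &D : P.Diff)
        D += Weight; // register becomes live: pressure rises
    }

    int main() {
      PressureDeltaModel P;
      consumeLastUse(P, 1); // last use of an incoming 32-bit vreg
      produceDef(P, 4);     // definition of a wide outgoing vreg
      std::printf("%d %d\n", P.Diff[0], P.Diff[1]); // 3 3
    }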
@@ -1846,7 +1851,7 @@ SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
// For now only track virtual registers
if (!Reg.isVirtual())
continue;
- PSetIterator PSetI = MRI.getPressureSets(Reg);
+ PSetIterator PSetI = MRI.getPressureSets(VirtRegOrUnit(Reg));
for (; PSetI.isValid(); ++PSetI) {
if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
VgprUsage += PSetI.getWeight();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
index b219cbd..1245774 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -389,7 +389,7 @@ private:
SIBlockSchedCandidate &TryCand);
SIScheduleBlock *pickBlock();
- void addLiveRegs(std::set<Register> &Regs);
+ void addLiveRegs(std::set<VirtRegOrUnit> &Regs);
void decreaseLiveRegs(SIScheduleBlock *Block, std::set<Register> &Regs);
void releaseBlockSuccs(SIScheduleBlock *Parent);
void blockScheduled(SIScheduleBlock *Block);
@@ -462,18 +462,18 @@ public:
unsigned &VgprUsage,
unsigned &SgprUsage);
- std::set<Register> getInRegs() {
- std::set<Register> InRegs;
+ std::set<VirtRegOrUnit> getInRegs() {
+ std::set<VirtRegOrUnit> InRegs;
for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
- InRegs.insert(RegMaskPair.RegUnit);
+ InRegs.insert(RegMaskPair.VRegOrUnit);
}
return InRegs;
}
- std::set<unsigned> getOutRegs() {
- std::set<unsigned> OutRegs;
+ std::set<VirtRegOrUnit> getOutRegs() {
+ std::set<VirtRegOrUnit> OutRegs;
for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
- OutRegs.insert(RegMaskPair.RegUnit);
+ OutRegs.insert(RegMaskPair.VRegOrUnit);
}
return OutRegs;
};
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 484861d..0daeecd 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -25,6 +25,7 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"
@@ -277,6 +278,12 @@ public:
/// rmw operation, "std::nullopt" otherwise.
std::optional<SIMemOpInfo>
getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
+
+ /// \returns DMA to LDS info if \p MI is a direct-to/from-LDS load/store,
+ /// along with an indication of whether this is a load or store. If it is not
+ /// a direct-to-LDS operation, returns std::nullopt.
+ std::optional<SIMemOpInfo>
+ getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const;
};
class SICacheControl {
@@ -295,16 +302,17 @@ protected:
SICacheControl(const GCNSubtarget &ST);
- /// Sets named bit \p BitName to "true" if present in instruction \p MI.
+ /// Sets the CPol \p Bits in instruction \p MI if it has a CPol operand.
/// \returns Returns true if \p MI is modified, false otherwise.
- bool enableNamedBit(const MachineBasicBlock::iterator MI,
- AMDGPU::CPol::CPol Bit) const;
+ bool enableCPolBits(const MachineBasicBlock::iterator MI,
+ unsigned Bits) const;
/// Check if any atomic operation on AS can affect memory accessible via the
/// global address space.
bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const;
public:
+ using CPol = AMDGPU::CPol::CPol;
/// Create a cache control for the subtarget \p ST.
static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
@@ -360,11 +368,13 @@ public:
/// between memory instructions to enforce the order they become visible as
/// observed by other memory instructions executing in memory scope \p Scope.
/// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
- /// address spaces. Returns true iff any instructions inserted.
+ /// address spaces. If \p AtomicsOnly is true, only insert waits for counters
+ /// that are used by atomic instructions.
+ /// Returns true iff any instructions were inserted.
virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const = 0;
+ AtomicOrdering Order, bool AtomicsOnly) const = 0;
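The new AtomicsOnly flag narrows which counters a wait covers. A standalone model of the intended filtering (the counter names and the atomic-capable set are assumptions for illustration only):

    #include <cstdio>

    enum Counter : unsigned { LOADCNT = 1, STORECNT = 2, SAMPLECNT = 4 };

    // Model: with AtomicsOnly set, wait only on counters that atomic
    // instructions can increment; leave the rest alone.
    static unsigned countersToWait(unsigned Needed, bool AtomicsOnly) {
      const unsigned AtomicCapable = LOADCNT | STORECNT; // assumed split
      return AtomicsOnly ? (Needed & AtomicCapable) : Needed;
    }

    int main() {
      unsigned W = countersToWait(LOADCNT | SAMPLECNT, /*AtomicsOnly=*/true);
      std::printf("%u\n", W); // 1: the sample counter is skipped
    }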
/// Inserts any necessary instructions at position \p Pos relative to
/// instruction \p MI to ensure any subsequent memory instructions of this
@@ -388,31 +398,17 @@ public:
bool IsCrossAddrSpaceOrdering,
Position Pos) const = 0;
- /// Inserts any necessary instructions before the barrier start instruction
- /// \p MI in order to support pairing of barriers and fences.
- virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const {
- return false;
- };
+ /// Handle operations that are considered non-volatile.
+ /// See \ref isNonVolatileMemoryAccess
+ virtual bool handleNonVolatile(MachineInstr &MI) const { return false; }
/// Virtual destructor to allow derivations to be deleted.
virtual ~SICacheControl() = default;
};
-class SIGfx6CacheControl : public SICacheControl {
-protected:
-
- /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::GLC);
- }
-
- /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::SLC);
- }
-
+/// Generates code sequences for the memory model of all GFX targets below
+/// GFX10.
+class SIGfx6CacheControl final : public SICacheControl {
public:
SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
@@ -437,7 +433,7 @@ public:
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const override;
+ AtomicOrdering Order, bool AtomicsOnly) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -451,30 +447,26 @@ public:
Position Pos) const override;
};
-class SIGfx7CacheControl : public SIGfx6CacheControl {
-public:
-
- SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
-
- bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
-
-};
-
-class SIGfx90ACacheControl : public SIGfx7CacheControl {
+/// Generates code sequences for the memory model of GFX10/11.
+class SIGfx10CacheControl final : public SICacheControl {
public:
-
- SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
+ SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override;
+ bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override {
+ return false;
+ }
+
bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
+ SIAtomicAddrSpace AddrSpace) const override {
+ return false;
+ }
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
@@ -484,124 +476,27 @@ public:
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const override;
-
- bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
-
- bool insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const override;
-};
-
-class SIGfx940CacheControl : public SIGfx90ACacheControl {
-protected:
-
- /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::SC0);
- }
-
- /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::SC1);
- }
-
- /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::NT);
- }
-
-public:
- SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
-
- bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal,
- bool IsLastUse) const override;
+ AtomicOrdering Order, bool AtomicsOnly) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
- Position Pos) const override;
-};
-
-class SIGfx10CacheControl : public SIGfx7CacheControl {
-protected:
-
- /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::DLC);
+ Position Pos) const override {
+ return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
+ /*AtomicsOnly=*/false);
}
-
-public:
-
- SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
-
- bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal,
- bool IsLastUse) const override;
-
- bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const override;
-
- bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
-
- bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override;
-};
-
-class SIGfx11CacheControl : public SIGfx10CacheControl {
-public:
- SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
-
- bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal,
- bool IsLastUse) const override;
};
-class SIGfx12CacheControl : public SIGfx11CacheControl {
+class SIGfx12CacheControl final : public SICacheControl {
protected:
// Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
// \returns Returns true if \p MI is modified, false otherwise.
bool setTH(const MachineBasicBlock::iterator MI,
AMDGPU::CPol::CPol Value) const;
+
// Sets Scope policy to \p Value if CPol operand is present in instruction \p
// MI. \returns Returns true if \p MI is modified, false otherwise.
bool setScope(const MachineBasicBlock::iterator MI,
@@ -620,16 +515,16 @@ protected:
SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
public:
- SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
- // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
- // the behavior is the same if assuming GFX12.0 in CU mode.
- assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
+ SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {
+ // GFX120x and GFX125x memory models greatly overlap, and in some cases
+ // the behavior is the same if assuming GFX120x in CU mode.
+ assert(!ST.hasGFX1250Insts() || ST.hasGFX13Insts() || ST.isCuModeEnabled());
}
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const override;
+ AtomicOrdering Order, bool AtomicsOnly) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
@@ -641,7 +536,7 @@ public:
bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
- virtual bool handleCooperativeAtomic(MachineInstr &MI) const override;
+ bool handleCooperativeAtomic(MachineInstr &MI) const override;
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
@@ -664,6 +559,8 @@ public:
SIAtomicAddrSpace AddrSpace) const override {
return setAtomicScope(MI, Scope, AddrSpace);
}
+
+ bool handleNonVolatile(MachineInstr &MI) const override;
};
class SIMemoryLegalizer final {
@@ -701,6 +598,9 @@ private:
/// instructions are added/deleted or \p MI is modified, false otherwise.
bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
+ /// Expands LDS DMA operation \p MI. Returns true if instructions are
+ /// added/deleted or \p MI is modified, false otherwise.
+ bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI);
public:
SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
@@ -775,7 +675,7 @@ getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
const char *Msg) const {
- const Function &Func = MI->getParent()->getParent()->getFunction();
+ const Function &Func = MI->getMF()->getFunction();
Func.getContext().diagnose(
DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
}
@@ -830,6 +730,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
return SIAtomicAddrSpace::SCRATCH;
if (AS == AMDGPUAS::REGION_ADDRESS)
return SIAtomicAddrSpace::GDS;
+ if (AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
+ AS == AMDGPUAS::BUFFER_STRIDED_POINTER)
+ return SIAtomicAddrSpace::GLOBAL;
return SIAtomicAddrSpace::OTHER;
}
@@ -879,6 +782,13 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
}
}
+ // FIXME: The MMO of buffer atomic instructions does not always have an atomic
+ // ordering. We only need to handle VBUFFER atomics on GFX12+ so we can fix it
+ // here, but the lowering should really be cleaned up at some point.
+ if ((ST.getGeneration() >= GCNSubtarget::GFX12) && SIInstrInfo::isBUF(*MI) &&
+ SIInstrInfo::isAtomic(*MI) && Ordering == AtomicOrdering::NotAtomic)
+ Ordering = AtomicOrdering::Monotonic;
+
SIAtomicScope Scope = SIAtomicScope::NONE;
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
bool IsCrossAddressSpaceOrdering = false;
@@ -985,19 +895,41 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
return constructFromMIWithMMO(MI);
}
+std::optional<SIMemOpInfo>
+SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
+ assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
+
+ if (!SIInstrInfo::isLDSDMA(*MI))
+ return std::nullopt;
+
+ return constructFromMIWithMMO(MI);
+}
+
+/// \returns true if \p MI has one or more MMOs, all of which are eligible to
+/// be marked as non-volatile: each either accesses the constant address
+/// space, accesses a known invariant memory location, or is marked with the
+/// non-volatile metadata/MMO flag.
+static bool isNonVolatileMemoryAccess(const MachineInstr &MI) {
+ if (MI.getNumMemOperands() == 0)
+ return false;
+ return all_of(MI.memoperands(), [&](const MachineMemOperand *MMO) {
+ return MMO->getFlags() & (MOThreadPrivate | MachineMemOperand::MOInvariant);
+ });
+}
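A standalone model of the predicate just defined (flag names invented): there must be at least one memory operand, and every operand must carry a qualifying flag.

    #include <algorithm>
    #include <vector>

    enum Flag : unsigned { Invariant = 1, ThreadPrivate = 2 };

    // Mirror of the all_of pattern above over per-operand flag words.
    static bool allNonVolatile(const std::vector<unsigned> &MMOFlags) {
      if (MMOFlags.empty())
        return false; // no MMO means nothing can be proven
      return std::all_of(MMOFlags.begin(), MMOFlags.end(), [](unsigned F) {
        return F & (Invariant | ThreadPrivate);
      });
    }

    int main() {
      return allNonVolatile({Invariant, ThreadPrivate}) ? 0 : 1; // exits 0
    }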
+
SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
TII = ST.getInstrInfo();
IV = getIsaVersion(ST.getCPU());
InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}
-bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
- AMDGPU::CPol::CPol Bit) const {
+bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator MI,
+ unsigned Bits) const {
MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
if (!CPol)
return false;
- CPol->setImm(CPol->getImm() | Bit);
+ CPol->setImm(CPol->getImm() | Bits);
return true;
}
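Where the old enableNamedBit set one bit per call, the new helper ORs an arbitrary mask in one step. A standalone sketch of the same shape (model types, not the LLVM ones):

    #include <cassert>
    #include <optional>

    struct InstrModel { std::optional<unsigned> CPolImm; };

    // OR several cache-policy bits into the CPol immediate, if the operand
    // exists on this instruction at all.
    static bool enableBits(InstrModel &MI, unsigned Bits) {
      if (!MI.CPolImm)
        return false; // no cpol operand: nothing to modify
      *MI.CPolImm |= Bits;
      return true;
    }

    int main() {
      InstrModel MI{0u};
      enableBits(MI, 0b11); // one call instead of two named-bit helpers
      assert(*MI.CPolImm == 0b11);
    }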
@@ -1013,18 +945,10 @@ bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
- if (ST.hasGFX940Insts())
- return std::make_unique<SIGfx940CacheControl>(ST);
- if (ST.hasGFX90AInsts())
- return std::make_unique<SIGfx90ACacheControl>(ST);
- if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
- return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX10)
- return std::make_unique<SIGfx7CacheControl>(ST);
- if (Generation < AMDGPUSubtarget::GFX11)
- return std::make_unique<SIGfx10CacheControl>(ST);
+ return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX12)
- return std::make_unique<SIGfx11CacheControl>(ST);
+ return std::make_unique<SIGfx10CacheControl>(ST);
return std::make_unique<SIGfx12CacheControl>(ST);
}
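After this change the factory dispatches over just three cache-control classes, with per-target differences handled inside each. A standalone model of the collapsed dispatch:

    #include <memory>

    struct CacheControlModel { virtual ~CacheControlModel() = default; };
    struct PreGfx10Model final : CacheControlModel {};  // all targets < GFX10
    struct Gfx10Model final : CacheControlModel {};     // GFX10/11
    struct Gfx12Model final : CacheControlModel {};     // GFX12+

    static std::unique_ptr<CacheControlModel> createModel(int Generation) {
      if (Generation < 10)
        return std::make_unique<PreGfx10Model>();
      if (Generation < 12)
        return std::make_unique<Gfx10Model>();
      return std::make_unique<Gfx12Model>();
    }

    int main() {
      auto CC = createModel(9); // lands in the pre-GFX10 class
      (void)CC;
    }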
@@ -1033,33 +957,61 @@ bool SIGfx6CacheControl::enableLoadCacheBypass(
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && !MI->mayStore());
- bool Changed = false;
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
+ if (!canAffectGlobalAddrSpace(AddrSpace)) {
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+ return false;
+ }
+
+ bool Changed = false;
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ if (ST.hasGFX940Insts()) {
+ // Set SC bits to indicate system scope.
+ Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
+ break;
+ }
+ [[fallthrough]];
+ case SIAtomicScope::AGENT:
+ if (ST.hasGFX940Insts()) {
+ // Set SC bits to indicate agent scope.
+ Changed |= enableCPolBits(MI, CPol::SC1);
+ } else {
// Set L1 cache policy to MISS_EVICT.
// Note: there is no L2 cache bypass policy at the ISA level.
- Changed |= enableGLCBit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to bypass.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
+ Changed |= enableCPolBits(MI, CPol::GLC);
}
+ break;
+ case SIAtomicScope::WORKGROUP:
+ if (ST.hasGFX940Insts()) {
+ // In threadgroup split mode the waves of a work-group can be executing
+ // on different CUs. Therefore need to bypass the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be bypassed. Setting
+ // SC bits to indicate work-group scope will do this automatically.
+ Changed |= enableCPolBits(MI, CPol::SC0);
+ } else if (ST.hasGFX90AInsts()) {
+ // In threadgroup split mode the waves of a work-group can be executing
+ // on different CUs. Therefore need to bypass the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be bypassed.
+ if (ST.isTgSplitEnabled())
+ Changed |= enableCPolBits(MI, CPol::GLC);
+ }
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
}
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
return Changed;
}
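The GFX940 arm of the switch above encodes scope in the two SC bits; a standalone sketch of that mapping for loads (the bit values are illustrative):

    #include <cstdio>

    enum : unsigned { SC0 = 1, SC1 = 2 };

    // SC0/SC1 select wavefront, workgroup, agent, or system scope.
    static unsigned loadScopeBits(int Scope /*0=wave,1=wg,2=agent,3=system*/) {
      switch (Scope) {
      case 3: return SC0 | SC1; // system
      case 2: return SC1;       // agent
      case 1: return SC0;       // workgroup
      default: return 0;        // wavefront / single thread
      }
    }

    int main() { std::printf("%u\n", loadScopeBits(3)); } // prints 3: SC0|SC1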
@@ -1070,8 +1022,39 @@ bool SIGfx6CacheControl::enableStoreCacheBypass(
assert(!MI->mayLoad() && MI->mayStore());
bool Changed = false;
- /// The L1 cache is write through so does not need to be bypassed. There is no
- /// bypass control for the L2 cache at the isa level.
+ /// For targets other than GFX940, the L1 cache is write-through, so it does
+ /// not need to be bypassed. There is no bypass control for the L2 cache at
+ /// the ISA level.
+
+ if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC bits to indicate system scope.
+ Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
+ break;
+ case SIAtomicScope::AGENT:
+ // Set SC bits to indicate agent scope.
+ Changed |= enableCPolBits(MI, CPol::SC1);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // Set SC bits to indicate workgroup scope.
+ Changed |= enableCPolBits(MI, CPol::SC0);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Leave SC bits unset to indicate wavefront scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+ }
return Changed;
}
@@ -1083,10 +1066,31 @@ bool SIGfx6CacheControl::enableRMWCacheBypass(
assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;
- /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
- /// bypassed, and the GLC bit is instead used to indicate if they are
- /// return or no-return.
- /// Note: there is no L2 cache coherent bypass control at the ISA level.
+ /// For targets other than GFX940, do not set GLC for RMW atomic operations as
+ /// L0/L1 cache is automatically bypassed, and the GLC bit is instead used to
+ /// indicate if they are return or no-return. Note: there is no L2 cache
+ /// coherent bypass control at the ISA level.
+ /// For GFX90A+, RMW atomics implicitly bypass the L1 cache.
+
+ if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC1 bit to indicate system scope.
+ Changed |= enableCPolBits(MI, CPol::SC1);
+ break;
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // RMW atomic operations implicitly bypass the L1 cache and only use SC1
+ // to indicate system or agent scope. The SC0 bit is used to indicate if
+ // they are return or no-return. Leave SC1 bit unset to indicate agent
+ // scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
return Changed;
}
@@ -1097,7 +1101,7 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
- assert(MI->mayLoad() ^ MI->mayStore());
+ assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -1108,11 +1112,15 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
bool Changed = false;
if (IsVolatile) {
- // Set L1 cache policy to be MISS_EVICT for load instructions
- // and MISS_LRU for store instructions.
- // Note: there is no L2 cache bypass policy at the ISA level.
- if (Op == SIMemOp::LOAD)
- Changed |= enableGLCBit(MI);
+ if (ST.hasGFX940Insts()) {
+ // Set SC bits to indicate system scope.
+ Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
+ } else if (Op == SIMemOp::LOAD) {
+ // Set L1 cache policy to be MISS_EVICT for load instructions
+ // and MISS_LRU for store instructions.
+ // Note: there is no L2 cache bypass policy at the ISA level.
+ Changed |= enableCPolBits(MI, CPol::GLC);
+ }
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
@@ -1120,16 +1128,20 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
+ Position::AFTER, AtomicOrdering::Unordered,
+ /*AtomicsOnly=*/false);
return Changed;
}
if (IsNonTemporal) {
- // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
- // for both loads and stores, and the L2 cache policy to STREAM.
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
+ if (ST.hasGFX940Insts()) {
+ Changed |= enableCPolBits(MI, CPol::NT);
+ } else {
+ // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
+ // for both loads and stores, and the L2 cache policy to STREAM.
+ Changed |= enableCPolBits(MI, CPol::SLC | CPol::GLC);
+ }
return Changed;
}
@@ -1140,15 +1152,36 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const {
+ AtomicOrdering Order,
+ bool AtomicsOnly) const {
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI->getDebugLoc();
if (Pos == Position::AFTER)
++MI;
+ // GFX90A+
+ if (ST.hasGFX90AInsts() && ST.isTgSplitEnabled()) {
+ // In threadgroup split mode the waves of a work-group can be executing on
+ // different CUs. Therefore need to wait for global or GDS memory operations
+ // to complete to ensure they are visible to waves in the other CUs.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are on
+ // the same CU, so no need to wait for global memory as all waves in the
+ // work-group access the same L1, nor wait for GDS as accesses are ordered
+ // on a CU.
+ if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
+ SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
+ (Scope == SIAtomicScope::WORKGROUP)) {
+ // Same as pre-GFX90A at AGENT scope.
+ Scope = SIAtomicScope::AGENT;
+ }
+ // In threadgroup split mode LDS cannot be allocated so no need to wait for
+ // LDS memory operations.
+ AddrSpace &= ~SIAtomicAddrSpace::LDS;
+ }
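A standalone model of the threadgroup-split adjustment just added (bit values invented): workgroup-scope waits touching global, scratch, or GDS memory are widened to agent scope, and the LDS bit is dropped because LDS cannot be allocated in tgsplit mode.

    #include <cassert>

    enum : unsigned { GLOBAL = 1, LDS = 2, GDS = 4, SCRATCH = 8 };

    static void adjustForTgSplit(unsigned &AddrSpace,
                                 int &Scope /*1=workgroup, 2=agent*/) {
      if ((AddrSpace & (GLOBAL | SCRATCH | GDS)) && Scope == 1)
        Scope = 2;          // other CUs must see the result: widen to agent
      AddrSpace &= ~LDS;    // no LDS in tgsplit mode, so never wait on it
    }

    int main() {
      unsigned AS = GLOBAL | LDS;
      int Scope = 1;
      adjustForTgSplit(AS, Scope);
      assert(Scope == 2 && AS == GLOBAL);
    }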
+
bool VMCnt = false;
bool LGKMCnt = false;
@@ -1243,61 +1276,13 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
return Changed;
}
-bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
+static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST) {
+ if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return false;
-
- bool Changed = false;
-
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
- Changed = true;
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to invalidate.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory cache
- /// to be flushed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- if (Pos == Position::AFTER)
- --MI;
-
- return Changed;
-}
-
-bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
- return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
+ return !ST.isAmdPalOS() && !ST.isMesa3DOS();
}
-bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
@@ -1307,235 +1292,97 @@ bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
-
- const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
-
- const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
- ? AMDGPU::BUFFER_WBINVL1
- : AMDGPU::BUFFER_WBINVL1_VOL;
+ const DebugLoc &DL = MI->getDebugLoc();
if (Pos == Position::AFTER)
++MI;
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
- Changed = true;
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to invalidate.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory cache
- /// to be flushed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- if (Pos == Position::AFTER)
- --MI;
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::enableLoadCacheBypass(
- const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && !MI->mayStore());
- bool Changed = false;
+ const unsigned InvalidateL1 = canUseBUFFER_WBINVL1_VOL(ST)
+ ? AMDGPU::BUFFER_WBINVL1_VOL
+ : AMDGPU::BUFFER_WBINVL1;
if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- // Set the L1 cache policy to MISS_LRU.
- // Note: there is no L2 cache bypass policy at the ISA level.
- Changed |= enableGLCBit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to bypass the L1 which is per CU.
- // Otherwise in non-threadgroup split mode all waves of a work-group are
- // on the same CU, and so the L1 does not need to be bypassed.
- if (ST.isTgSplitEnabled())
- Changed |= enableGLCBit(MI);
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to bypass.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::enableRMWCacheBypass(
- const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && MI->mayStore());
- bool Changed = false;
+ if (ST.hasGFX940Insts()) {
+ // Ensures that following loads will not see stale remote VMEM data or
+ // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
+ // and CC will never be stale due to the local memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+ // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
+ // remove any cache lines of earlier writes by the same wave and ensures
+ // later reads by the same wave will refetch the cache lines.
+ Changed = true;
+ break;
+ }
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
+ if (ST.hasGFX90AInsts()) {
+ // Ensures that following loads will not see stale remote VMEM data or
+ // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
+ // and CC will never be stale due to the local memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
+ BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
+ // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed
+ // to remove any cache lines of earlier writes by the same wave and
+ // ensures later reads by the same wave will refetch the cache lines.
+ Changed = true;
+ break;
+ }
+ [[fallthrough]];
case SIAtomicScope::AGENT:
- /// Do not set glc for RMW atomic operations as they implicitly bypass
- /// the L1 cache, and the glc bit is instead used to indicate if they are
- /// return or no-return.
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
- MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
- // Only handle load and store, not atomic read-modify-write insructions. The
- // latter use glc to indicate if the atomic returns a result and so must not
- // be used for cache control.
- assert(MI->mayLoad() ^ MI->mayStore());
-
- // Only update load and store, not LLVM IR atomic read-modify-write
- // instructions. The latter are always marked as volatile so cannot sensibly
- // handle it as do not want to pessimize all atomics. Also they do not support
- // the nontemporal attribute.
- assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
-
- bool Changed = false;
-
- if (IsVolatile) {
- // Set L1 cache policy to be MISS_EVICT for load instructions
- // and MISS_LRU for store instructions.
- // Note: there is no L2 cache bypass policy at the ISA level.
- if (Op == SIMemOp::LOAD)
- Changed |= enableGLCBit(MI);
-
- // Ensure operation has completed at system scope to cause all volatile
- // operations to be visible outside the program in a global order. Do not
- // request cross address space as only the global address space can be
- // observable outside the program, so no need to cause a waitcnt for LDS
- // address space operations.
- Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
-
- return Changed;
- }
-
- if (IsNonTemporal) {
- // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
- // for both loads and stores, and the L2 cache policy to STREAM.
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
- return Changed;
- }
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsCrossAddrSpaceOrdering,
- Position Pos,
- AtomicOrdering Order) const {
- if (ST.isTgSplitEnabled()) {
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to wait for global or GDS memory operations
- // to complete to ensure they are visible to waves in the other CUs.
- // Otherwise in non-threadgroup split mode all waves of a work-group are on
- // the same CU, so no need to wait for global memory as all waves in the
- // work-group access the same the L1, nor wait for GDS as access are ordered
- // on a CU.
- if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
- SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
- (Scope == SIAtomicScope::WORKGROUP)) {
- // Same as GFX7 using agent scope.
- Scope = SIAtomicScope::AGENT;
- }
- // In threadgroup split mode LDS cannot be allocated so no need to wait for
- // LDS memory operations.
- AddrSpace &= ~SIAtomicAddrSpace::LDS;
- }
- return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
- IsCrossAddrSpaceOrdering, Pos, Order);
-}
-
-bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
- return false;
-
- bool Changed = false;
-
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Ensures that following loads will not see stale remote VMEM data or
- // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
- // CC will never be stale due to the local memory probes.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
- // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
- // remove any cache lines of earlier writes by the same wave and ensures
- // later reads by the same wave will refetch the cache lines.
+ if (ST.hasGFX940Insts()) {
+ // Ensures that following loads will not see stale remote data or local
+ // MTYPE NC global data. Local MTYPE RW and CC memory will never be
+ // stale due to the memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
+ // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+ // does not reorder memory operations with respect to a preceding buffer
+ // invalidate. The invalidate is guaranteed to remove any cache lines of
+ // earlier writes and ensures later reads will refetch the cache lines.
+ } else
+ BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
Changed = true;
break;
- case SIAtomicScope::AGENT:
- // Same as GFX7.
- break;
case SIAtomicScope::WORKGROUP:
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to invalidate the L1 which is per CU.
- // Otherwise in non-threadgroup split mode all waves of a work-group are
- // on the same CU, and so the L1 does not need to be invalidated.
if (ST.isTgSplitEnabled()) {
- // Same as GFX7 using agent scope.
- Scope = SIAtomicScope::AGENT;
+ if (ST.hasGFX940Insts()) {
+ // In threadgroup split mode the waves of a work-group can be
+ // executing on different CUs. Therefore need to invalidate the L1
+ // which is per CU. Otherwise in non-threadgroup split mode all waves
+ // of a work-group are on the same CU, and so the L1 does not need to
+ // be invalidated.
+
+ // Ensures L1 is invalidated if in threadgroup split mode. In
+ // non-threadgroup split mode it is a NOP, so there is no point
+ // generating it when we know we are not in that mode.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate work-group scope.
+ .addImm(AMDGPU::CPol::SC0);
+ // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+ // does not reorder memory operations with respect to a preceding
+ // buffer invalidate. The invalidate is guaranteed to remove any cache
+ // lines of earlier writes and ensures later reads will refetch the
+ // cache lines.
+ Changed = true;
+ } else if (ST.hasGFX90AInsts()) {
+ BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
+ Changed = true;
+ }
}
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
- // Same as GFX7.
+ // For GFX940, we could generate "BUFFER_INV" but it would do nothing as
+ // there are no caches to invalidate. All other targets have no cache to
+ // invalidate.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
@@ -1552,366 +1399,76 @@ bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
--MI;
- Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
-
return Changed;
}
-bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
- bool Changed = false;
-
- MachineBasicBlock &MBB = *MI->getParent();
- const DebugLoc &DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
- // to initiate writeback of any dirty cache lines of earlier writes by the
- // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
- // writeback has completed.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
- // Set SC bits to indicate system scope.
- .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
- // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
- // vmcnt(0)" needed by the "BUFFER_WBL2".
- Changed = true;
- break;
- case SIAtomicScope::AGENT:
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Same as GFX7.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- if (Pos == Position::AFTER)
- --MI;
-
- Changed |=
- SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
- IsCrossAddrSpaceOrdering, Pos);
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::enableLoadCacheBypass(
- const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && !MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Set SC bits to indicate system scope.
- Changed |= enableSC0Bit(MI);
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::AGENT:
- // Set SC bits to indicate agent scope.
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to bypass the L1 which is per CU.
- // Otherwise in non-threadgroup split mode all waves of a work-group are
- // on the same CU, and so the L1 does not need to be bypassed. Setting SC
- // bits to indicate work-group scope will do this automatically.
- Changed |= enableSC0Bit(MI);
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Leave SC bits unset to indicate wavefront scope.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::enableStoreCacheBypass(
- const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
- assert(!MI->mayLoad() && MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Set SC bits to indicate system scope.
- Changed |= enableSC0Bit(MI);
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::AGENT:
- // Set SC bits to indicate agent scope.
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- // Set SC bits to indicate workgroup scope.
- Changed |= enableSC0Bit(MI);
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Leave SC bits unset to indicate wavefront scope.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::enableRMWCacheBypass(
- const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Set SC1 bit to indicate system scope.
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::AGENT:
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // RMW atomic operations implicitly bypass the L1 cache and only use SC1
- // to indicate system or agent scope. The SC0 bit is used to indicate if
- // they are return or no-return. Leave SC1 bit unset to indicate agent
- // scope.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
- MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
- // Only handle load and store, not atomic read-modify-write insructions. The
- // latter use glc to indicate if the atomic returns a result and so must not
- // be used for cache control.
- assert(MI->mayLoad() ^ MI->mayStore());
-
- // Only update load and store, not LLVM IR atomic read-modify-write
- // instructions. The latter are always marked as volatile so cannot sensibly
- // handle it as do not want to pessimize all atomics. Also they do not support
- // the nontemporal attribute.
- assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
-
- bool Changed = false;
-
- if (IsVolatile) {
- // Set SC bits to indicate system scope.
- Changed |= enableSC0Bit(MI);
- Changed |= enableSC1Bit(MI);
-
- // Ensure operation has completed at system scope to cause all volatile
- // operations to be visible outside the program in a global order. Do not
- // request cross address space as only the global address space can be
- // observable outside the program, so no need to cause a waitcnt for LDS
- // address space operations.
- Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
-
- return Changed;
- }
-
- if (IsNonTemporal) {
- Changed |= enableNTBit(MI);
- return Changed;
- }
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
- return false;
-
+bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
bool Changed = false;
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
+ if (ST.hasGFX90AInsts()) {
+ MachineBasicBlock &MBB = *MI->getParent();
+ const DebugLoc &DL = MI->getDebugLoc();
- if (Pos == Position::AFTER)
- ++MI;
+ if (Pos == Position::AFTER)
+ ++MI;
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Ensures that following loads will not see stale remote VMEM data or
- // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
- // CC will never be stale due to the local memory probes.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
- // Set SC bits to indicate system scope.
- .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
- // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
- // remove any cache lines of earlier writes by the same wave and ensures
- // later reads by the same wave will refetch the cache lines.
- Changed = true;
- break;
- case SIAtomicScope::AGENT:
- // Ensures that following loads will not see stale remote date or local
- // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
- // due to the memory probes.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
- // Set SC bits to indicate agent scope.
- .addImm(AMDGPU::CPol::SC1);
- // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
-    // does not reorder memory operations with respect to preceding buffer
- // invalidate. The invalidate is guaranteed to remove any cache lines of
- // earlier writes and ensures later writes will refetch the cache lines.
- Changed = true;
- break;
- case SIAtomicScope::WORKGROUP:
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to invalidate the L1 which is per CU.
- // Otherwise in non-threadgroup split mode all waves of a work-group are
- // on the same CU, and so the L1 does not need to be invalidated.
- if (ST.isTgSplitEnabled()) {
- // Ensures L1 is invalidated if in threadgroup split mode. In
- // non-threadgroup split mode it is a NOP, but no point generating it in
- // that case if know not in that mode.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
- // Set SC bits to indicate work-group scope.
- .addImm(AMDGPU::CPol::SC0);
- // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
-      // does not reorder memory operations with respect to preceding buffer
- // invalidate. The invalidate is guaranteed to remove any cache lines of
- // earlier writes and ensures later writes will refetch the cache lines.
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
+ // to initiate writeback of any dirty cache lines of earlier writes by
+ // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
+ // writeback has completed.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ if (ST.hasGFX940Insts()) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
+
+ // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
+ // SIAtomicScope::AGENT, the following insertWait will generate the
+ // required "S_WAITCNT vmcnt(0)".
+ Changed = true;
+ }
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+      // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it
+      // would write back, and it would require an otherwise unnecessary
+      // "S_WAITCNT vmcnt(0)".
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
}
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Could generate "BUFFER_INV" but it would do nothing as there are no
- // caches to invalidate.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
}
- }
-
- /// The scratch address space does not need the global memory cache
- /// to be flushed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
- if (Pos == Position::AFTER)
- --MI;
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
- bool Changed = false;
-
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
- // to initiate writeback of any dirty cache lines of earlier writes by the
- // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
- // writeback has completed.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
- // Set SC bits to indicate system scope.
- .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
- // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
- // SIAtomicScope::SYSTEM, the following insertWait will generate the
- // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
- Changed = true;
- break;
- case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
- // Set SC bits to indicate agent scope.
- .addImm(AMDGPU::CPol::SC1);
-
- // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
- // SIAtomicScope::AGENT, the following insertWait will generate the
- // required "S_WAITCNT vmcnt(0)".
- Changed = true;
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Do not generate "BUFFER_WBL2" as there are no caches it would
- // writeback, and would require an otherwise unnecessary
- // "S_WAITCNT vmcnt(0)".
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
+ if (Pos == Position::AFTER)
+ --MI;
}
- if (Pos == Position::AFTER)
- --MI;
-
  // Insert the S_WAITCNT required by any "BUFFER_WBL2", as well as any other
  // S_WAITCNT that is needed.
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
+ /*AtomicsOnly=*/false);
return Changed;
}
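Both the release hook above and the hooks below gate their work on canAffectGlobalAddrSpace, whose definition falls outside this diff. A minimal sketch, assuming the helper only tests the GLOBAL bit of the address-space mask:

bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
  // Flat operations may address global memory, so their mask includes GLOBAL.
  return (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;
}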
bool SIGfx10CacheControl::enableLoadCacheBypass(
- const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
+ const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && !MI->mayStore());
bool Changed = false;
@@ -1922,8 +1479,9 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
case SIAtomicScope::AGENT:
// Set the L0 and L1 cache policies to MISS_EVICT.
// Note: there is no L2 cache coherent bypass control at the ISA level.
- Changed |= enableGLCBit(MI);
- Changed |= enableDLCBit(MI);
+ // For GFX10, set GLC+DLC, for GFX11, only set GLC.
+ Changed |=
+ enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0));
break;
case SIAtomicScope::WORKGROUP:
// In WGP mode the waves of a work-group can be executing on either CU of
@@ -1931,7 +1489,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
// CU mode all waves of a work-group are on the same CU, and so the L0
// does not need to be bypassed.
if (!ST.isCuModeEnabled())
- Changed |= enableGLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC);
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
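The enableCPolBits helper used above replaces the per-bit enableGLCBit/enableDLCBit setters; its definition is not part of this hunk. A minimal sketch, assuming it ORs the requested bits into the instruction's cpol operand and reports whether anything changed:

bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator &MI,
                                    unsigned Bits) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
  if (!CPol)
    return false; // Instruction has no cache-policy operand.
  int64_t Old = CPol->getImm();
  CPol->setImm(Old | Bits);
  return (Old | Bits) != Old;
}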
@@ -1959,7 +1517,7 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
  // Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
- assert(MI->mayLoad() ^ MI->mayStore());
+ assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -1974,17 +1532,21 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
// and MISS_LRU for store instructions.
// Note: there is no L2 cache coherent bypass control at the ISA level.
if (Op == SIMemOp::LOAD) {
- Changed |= enableGLCBit(MI);
- Changed |= enableDLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
}
+ // GFX11: Set MALL NOALLOC for both load and store instructions.
+ if (AMDGPU::isGFX11(ST))
+ Changed |= enableCPolBits(MI, CPol::DLC);
+
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
+ Position::AFTER, AtomicOrdering::Unordered,
+ /*AtomicsOnly=*/false);
return Changed;
}
@@ -1994,8 +1556,12 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
// For stores setting both GLC and SLC configures L0 and L1 cache policy
// to MISS_EVICT and the L2 cache policy to STREAM.
if (Op == SIMemOp::STORE)
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC);
+ Changed |= enableCPolBits(MI, CPol::SLC);
+
+ // GFX11: Set MALL NOALLOC for both load and store instructions.
+ if (AMDGPU::isGFX11(ST))
+ Changed |= enableCPolBits(MI, CPol::DLC);
return Changed;
}
@@ -2007,11 +1573,12 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
- Position Pos, AtomicOrdering Order) const {
+ Position Pos, AtomicOrdering Order,
+ bool AtomicsOnly) const {
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI->getDebugLoc();
if (Pos == Position::AFTER)
++MI;
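The new AtomicsOnly parameter lets callers state that the wait only has to cover a single preceding atomic, so counters no atomic can increment (bvh/samplecnt on GFX12) may be skipped. A hedged call-site sketch mirroring the expandLoad change further down:

// Acquire immediately after an atomic load: wait only on counters that
// the atomic itself can increment.
Changed |= CC->insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD,
                          /*IsCrossAddrSpaceOrdering=*/false, Position::AFTER,
                          AtomicOrdering::Acquire, /*AtomicsOnly=*/true);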
@@ -2035,8 +1602,11 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// the WGP. Therefore need to wait for operations to complete to ensure
// they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU
- // which shares the same L0.
- if (!ST.isCuModeEnabled()) {
+      // which shares the same L0. Note that we still need to wait when
+      // performing a release in this mode to respect the transitivity of
+      // happens-before: another wave in the workgroup must be able to
+      // re-release memory written by this wave at a wider scope.
+ if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
VMCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2143,7 +1713,7 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI->getDebugLoc();
if (Pos == Position::AFTER)
++MI;
@@ -2191,117 +1761,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return Changed;
}
-bool SIGfx10CacheControl::insertBarrierStart(
- MachineBasicBlock::iterator &MI) const {
- // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU
- // mode. This is because a CU mode release fence does not emit any wait, which
- // is fine when only dealing with vmem, but isn't sufficient in the presence
- // of barriers which do not go through vmem.
- // GFX12.5 does not require this additional wait.
- if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts())
- return false;
-
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
- return true;
-}
-
-bool SIGfx11CacheControl::enableLoadCacheBypass(
- const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && !MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- // Set the L0 and L1 cache policies to MISS_EVICT.
- // Note: there is no L2 cache coherent bypass control at the ISA level.
- Changed |= enableGLCBit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- // In WGP mode the waves of a work-group can be executing on either CU of
- // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
- // CU mode all waves of a work-group are on the same CU, and so the L0
- // does not need to be bypassed.
- if (!ST.isCuModeEnabled())
- Changed |= enableGLCBit(MI);
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to bypass.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- return Changed;
-}
-
-bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
- MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
-
-  // Only handle load and store, not atomic read-modify-write instructions. The
- // latter use glc to indicate if the atomic returns a result and so must not
- // be used for cache control.
- assert(MI->mayLoad() ^ MI->mayStore());
-
- // Only update load and store, not LLVM IR atomic read-modify-write
- // instructions. The latter are always marked as volatile so cannot sensibly
- // handle it as do not want to pessimize all atomics. Also they do not support
- // the nontemporal attribute.
- assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
-
- bool Changed = false;
-
- if (IsVolatile) {
- // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
- // and MISS_LRU for store instructions.
- // Note: there is no L2 cache coherent bypass control at the ISA level.
- if (Op == SIMemOp::LOAD)
- Changed |= enableGLCBit(MI);
-
- // Set MALL NOALLOC for load and store instructions.
- Changed |= enableDLCBit(MI);
-
- // Ensure operation has completed at system scope to cause all volatile
- // operations to be visible outside the program in a global order. Do not
- // request cross address space as only the global address space can be
- // observable outside the program, so no need to cause a waitcnt for LDS
- // address space operations.
- Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
- return Changed;
- }
-
- if (IsNonTemporal) {
- // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
- // and L2 cache policy to STREAM.
- // For stores setting both GLC and SLC configures L0 and L1 cache policy
- // to MISS_EVICT and the L2 cache policy to STREAM.
- if (Op == SIMemOp::STORE)
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
-
- // Set MALL NOALLOC for load and store instructions.
- Changed |= enableDLCBit(MI);
- return Changed;
- }
-
- return Changed;
-}
-
bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
AMDGPU::CPol::CPol Value) const {
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
@@ -2354,11 +1813,12 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
- Position Pos, AtomicOrdering Order) const {
+ Position Pos, AtomicOrdering Order,
+ bool AtomicsOnly) const {
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI->getDebugLoc();
bool LOADCnt = false;
bool DSCnt = false;
@@ -2383,15 +1843,20 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// In WGP mode the waves of a work-group can be executing on either CU
// of the WGP. Therefore need to wait for operations to complete to
// ensure they are visible to waves in the other CU as the L0 is per CU.
+ //
      // Otherwise, in CU mode, all waves of a work-group are on the same CU
- // which shares the same L0.
+      // which shares the same L0. Note that we still need to wait when
+      // performing a release in this mode to respect the transitivity of
+      // happens-before: another wave in the workgroup must be able to
+      // re-release memory written by this wave at a wider scope.
//
// GFX12.5:
// CU$ has two ports. To ensure operations are visible at the workgroup
// level, we need to ensure all operations in this port have completed
// so the other SIMDs in the WG can see them. There is no ordering
// guarantee between the ports.
- if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
+ if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() ||
+ isReleaseOrStronger(Order)) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
LOADCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2444,7 +1909,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
//
// This also applies to fences. Fences cannot pair with an instruction
// tracked with bvh/samplecnt as we don't have any atomics that do that.
- if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
+ if (!AtomicsOnly && ST.hasImageInsts()) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
}
@@ -2476,7 +1941,7 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return false;
MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI->getDebugLoc();
/// The scratch address space does not need the global memory cache
/// to be flushed as all memory operations by the same thread are
@@ -2527,6 +1992,17 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
--MI;
+  // Target requires a waitcnt to ensure that the preceding INV has completed,
+  // as it may get reordered with following load instructions.
+ if (ST.hasINVWBL2WaitCntRequirement() && Scope > SIAtomicScope::CLUSTER) {
+ insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD,
+ /*IsCrossAddrSpaceOrdering=*/false, Pos, AtomicOrdering::Acquire,
+ /*AtomicsOnly=*/false);
+
+ if (Pos == Position::AFTER)
+ --MI;
+ }
+
return true;
}
@@ -2538,7 +2014,7 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI->getDebugLoc();
// The scratch address space does not need the global memory cache
// writeback as all memory operations by the same thread are
@@ -2554,19 +2030,15 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
//
// Emitting it for lower scopes is a slow no-op, so we omit it
// for performance.
+ std::optional<AMDGPU::CPol::CPol> NeedsWB;
switch (Scope) {
case SIAtomicScope::SYSTEM:
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
- .addImm(AMDGPU::CPol::SCOPE_SYS);
- Changed = true;
+ NeedsWB = AMDGPU::CPol::SCOPE_SYS;
break;
case SIAtomicScope::AGENT:
// GFX12.5 may have >1 L2 per device so we must emit a device scope WB.
- if (ST.hasGFX1250Insts()) {
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
- .addImm(AMDGPU::CPol::SCOPE_DEV);
- Changed = true;
- }
+ if (ST.hasGFX1250Insts())
+ NeedsWB = AMDGPU::CPol::SCOPE_DEV;
break;
case SIAtomicScope::CLUSTER:
case SIAtomicScope::WORKGROUP:
@@ -2579,6 +2051,20 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
llvm_unreachable("Unsupported synchronization scope");
}
+ if (NeedsWB) {
+    // Target requires a waitcnt to ensure that the preceding store/rmw
+    // operations have completed in L2 so their data will be written back by
+    // the WB instruction.
+ if (ST.hasINVWBL2WaitCntRequirement())
+ insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+ /*IsCrossAddrSpaceOrdering=*/false, Pos,
+ AtomicOrdering::Release,
+ /*AtomicsOnly=*/false);
+
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(*NeedsWB);
+ Changed = true;
+ }
+
if (Pos == Position::AFTER)
--MI;
}
@@ -2587,17 +2073,29 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
// complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
// we of course need to wait for that as well.
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
+ /*AtomicsOnly=*/false);
return Changed;
}
+bool SIGfx12CacheControl::handleNonVolatile(MachineInstr &MI) const {
+ // On GFX12.5, set the NV CPol bit.
+ if (!ST.hasGFX1250Insts())
+ return false;
+ MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
+ if (!CPol)
+ return false;
+ CPol->setImm(CPol->getImm() | AMDGPU::CPol::NV);
+ return true;
+}
+
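handleNonVolatile is driven by the isNonVolatileMemoryAccess check added to the run() loop near the end of this file's hunks; that predicate's definition is not shown here. A sketch under the assumption that it merely inspects the memory operands:

// Assumed shape, not the patch's actual definition.
// (llvm::none_of comes from llvm/ADT/STLExtras.h.)
static bool isNonVolatileMemoryAccess(const MachineInstr &MI) {
  if (!MI.mayLoadOrStore())
    return false;
  return llvm::none_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
    return MMO->isVolatile();
  });
}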
bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
// Only handle load and store, not atomic read-modify-write instructions.
- assert(MI->mayLoad() ^ MI->mayStore());
+ assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -2618,13 +2116,21 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
if (IsVolatile) {
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
+ if (ST.requiresWaitXCntForSingleAccessInstructions() &&
+ SIInstrInfo::isVMEM(*MI)) {
+ MachineBasicBlock &MBB = *MI->getParent();
+ BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
+ Changed = true;
+ }
+
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
+ Position::AFTER, AtomicOrdering::Unordered,
+ /*AtomicsOnly=*/false);
}
return Changed;
@@ -2635,9 +2141,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
const bool IsRMW = (MI.mayLoad() && MI.mayStore());
bool Changed = false;
- // GFX12.5 only: xcnt wait is needed before flat and global atomics
- // stores/rmw.
- if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
+ if (Atomic && ST.requiresWaitXCntForSingleAccessInstructions() &&
+ SIInstrInfo::isVMEM(MI)) {
MachineBasicBlock &MBB = *MI.getParent();
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
Changed = true;
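requiresWaitXCntForSingleAccessInstructions generalizes the old requiresWaitXCntBeforeAtomicStores predicate from flat atomic stores/rmw to all single-access VMEM instructions; its body is not in this diff. A guess at its shape, assuming it stays GFX12.5-gated like the condition it replaces:

// Assumed subtarget predicate, inferred from the old GFX12.5-only comment.
bool requiresWaitXCntForSingleAccessInstructions() const {
  return hasGFX1250Insts();
}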
@@ -2653,7 +2158,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
const unsigned Scope = CPol->getImm() & CPol::SCOPE;
// GFX12.0 only: Extra waits needed before system scope stores.
- if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS)
+ if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic &&
+ Scope == CPol::SCOPE_SYS)
Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
return Changed;
@@ -2748,13 +2254,15 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
SIMemOp::LOAD | SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
- Position::BEFORE, Order);
+ Position::BEFORE, Order, /*AtomicsOnly=*/false);
if (Order == AtomicOrdering::Acquire ||
Order == AtomicOrdering::SequentiallyConsistent) {
- Changed |= CC->insertWait(
- MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD,
- MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
+ // The wait below only needs to wait on the prior atomic.
+ Changed |=
+ CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
+ SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(),
+ Position::AFTER, Order, /*AtomicsOnly=*/true);
Changed |= CC->insertAcquire(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::AFTER);
@@ -2830,9 +2338,11 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
if (MOI.isAtomic()) {
const AtomicOrdering Order = MOI.getOrdering();
if (Order == AtomicOrdering::Acquire) {
- Changed |= CC->insertWait(
- MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order);
+ // Acquire fences only need to wait on the previous atomic they pair with.
+ Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
+ SIMemOp::LOAD | SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE, Order, /*AtomicsOnly=*/true);
}
if (Order == AtomicOrdering::Release ||
@@ -2897,10 +2407,12 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
Order == AtomicOrdering::SequentiallyConsistent ||
MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
- Changed |= CC->insertWait(
- MI, MOI.getScope(), MOI.getInstrAddrSpace(),
- isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
- MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
+ // Only wait on the previous atomic.
+ Changed |=
+ CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
+ isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER,
+ Order, /*AtomicsOnly=*/true);
Changed |= CC->insertAcquire(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::AFTER);
@@ -2913,6 +2425,23 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
return Changed;
}
+bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI) {
+ assert(MI->mayLoad() && MI->mayStore());
+
+  // Whether the operation is volatile or nontemporal is a
+  // property of the global memory access, not of the LDS side.
+ SIMemOp OpKind =
+ SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE;
+
+ // Handle volatile and/or nontemporal markers on direct-to-LDS loads and
+ // stores. The operation is treated as a volatile/nontemporal store
+ // to its second argument.
+ return CC->enableVolatileAndOrNonTemporal(
+ MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(),
+ MOI.isNonTemporal(), MOI.isLastUse());
+}
+
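The assert above holds because an LDS DMA instruction reads one side and writes the other, so mayLoad() and mayStore() are both set and cannot discriminate direction; mayWriteLDSThroughDMA has to consult the opcode instead. A rough sketch of what it plausibly tests (the real helper lives in SIInstrInfo and may be stricter):

static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
  // Everything except the LDS-to-memory store direction writes LDS.
  return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
}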
bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
const MachineModuleInfo &MMI =
getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
@@ -2956,22 +2485,21 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
MI = II->getIterator();
}
- if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) {
- Changed |= CC->insertBarrierStart(MI);
- continue;
+ if (MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic) {
+ if (const auto &MOI = MOA.getLoadInfo(MI))
+ Changed |= expandLoad(*MOI, MI);
+ else if (const auto &MOI = MOA.getStoreInfo(MI))
+ Changed |= expandStore(*MOI, MI);
+ else if (const auto &MOI = MOA.getLDSDMAInfo(MI))
+ Changed |= expandLDSDMA(*MOI, MI);
+ else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
+ Changed |= expandAtomicFence(*MOI, MI);
+ else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
+ Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
}
- if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
- continue;
-
- if (const auto &MOI = MOA.getLoadInfo(MI))
- Changed |= expandLoad(*MOI, MI);
- else if (const auto &MOI = MOA.getStoreInfo(MI)) {
- Changed |= expandStore(*MOI, MI);
- } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
- Changed |= expandAtomicFence(*MOI, MI);
- else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
- Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
+ if (isNonVolatileMemoryAccess(*MI))
+ Changed |= CC->handleNonVolatile(*MI);
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
index f9efee6..9a58382 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
@@ -28,19 +28,9 @@ SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F,
DX10Clamp = DX10ClampAttr == "true";
}
- StringRef DenormF32Attr =
- F.getFnAttribute("denormal-fp-math-f32").getValueAsString();
- if (!DenormF32Attr.empty())
- FP32Denormals = parseDenormalFPAttribute(DenormF32Attr);
-
- StringRef DenormAttr =
- F.getFnAttribute("denormal-fp-math").getValueAsString();
- if (!DenormAttr.empty()) {
- DenormalMode DenormMode = parseDenormalFPAttribute(DenormAttr);
- if (DenormF32Attr.empty())
- FP32Denormals = DenormMode;
- FP64FP16Denormals = DenormMode;
- }
+ DenormalFPEnv FPEnv = F.getDenormalFPEnv();
+ FP64FP16Denormals = FPEnv.DefaultMode;
+ FP32Denormals = FPEnv.F32Mode;
}
using namespace AMDGPU;
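getDenormalFPEnv folds the two attribute parses deleted above into a single query; its declaration sits outside this diff. A hedged reconstruction of the shape the hunk implies, with field names taken from the usage:

// Assumed layout: DefaultMode comes from "denormal-fp-math", F32Mode from
// "denormal-fp-math-f32" and falls back to DefaultMode when that attribute
// is absent. DenormalMode is the existing type from FloatingPointMode.h.
struct DenormalFPEnv {
  DenormalMode DefaultMode;
  DenormalMode F32Mode;
};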
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index aa028c8..47bc218 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -76,9 +76,7 @@ class SIOptimizeExecMaskingLegacy : public MachineFunctionPass {
public:
static char ID;
- SIOptimizeExecMaskingLegacy() : MachineFunctionPass(ID) {
- initializeSIOptimizeExecMaskingLegacyPass(*PassRegistry::getPassRegistry());
- }
+ SIOptimizeExecMaskingLegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index c186f5a..ac24f2f 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -54,10 +54,7 @@ class SIOptimizeExecMaskingPreRALegacy : public MachineFunctionPass {
public:
static char ID;
- SIOptimizeExecMaskingPreRALegacy() : MachineFunctionPass(ID) {
- initializeSIOptimizeExecMaskingPreRALegacyPass(
- *PassRegistry::getPassRegistry());
- }
+ SIOptimizeExecMaskingPreRALegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -473,6 +470,8 @@ bool SIOptimizeExecMaskingPreRA::run(MachineFunction &MF) {
assert(Idx != -1);
if (SingleExecUser->getParent() == I->getParent() &&
!SingleExecUser->getOperand(Idx).isImplicit() &&
+ static_cast<unsigned>(Idx) <
+ SingleExecUser->getDesc().getNumOperands() &&
TII->isOperandLegal(*SingleExecUser, Idx, &I->getOperand(1))) {
LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I << '\n');
LIS->RemoveMachineInstrFromMaps(*I);
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
index 41402bd..610a835 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -81,6 +81,7 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/InitializePasses.h"
using namespace llvm;
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index bfac639..926c52f 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -118,7 +118,7 @@ public:
MachineInstr *getParentInst() const { return Target->getParent(); }
MachineRegisterInfo *getMRI() const {
- return &getParentInst()->getParent()->getParent()->getRegInfo();
+ return &getParentInst()->getMF()->getRegInfo();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1284,7 +1284,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
// Clone the instruction to allow revoking changes
// made to MI during the processing of the operands
// if the conversion fails.
- SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
+ SDWAInst = MI.getMF()->CloneMachineInstr(&MI);
MI.getParent()->insert(MI.getIterator(), SDWAInst);
} else {
SDWAInst = createSDWAVersion(MI);
@@ -1334,20 +1334,21 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
const MCInstrDesc &Desc = TII->get(MI.getOpcode());
unsigned ConstantBusCount = 0;
for (MachineOperand &Op : MI.explicit_uses()) {
- if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
- continue;
-
- unsigned I = Op.getOperandNo();
+ if (Op.isReg()) {
+ if (TRI->isVGPR(*MRI, Op.getReg()))
+ continue;
- int16_t RegClass = TII->getOpRegClassID(Desc.operands()[I]);
- if (RegClass == -1 || !TRI->isVSSuperClass(TRI->getRegClass(RegClass)))
+ if (ST.hasSDWAScalar() && ConstantBusCount == 0) {
+ ++ConstantBusCount;
+ continue;
+ }
+ } else if (!Op.isImm())
continue;
- if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
- TRI->isSGPRReg(*MRI, Op.getReg())) {
- ++ConstantBusCount;
+ unsigned I = Op.getOperandNo();
+ const TargetRegisterClass *OpRC = TII->getRegClass(Desc, I);
+ if (!OpRC || !TRI->isVSSuperClass(OpRC))
continue;
- }
Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
@@ -1355,8 +1356,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
if (Op.isImm())
Copy.addImm(Op.getImm());
else if (Op.isReg())
- Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
- Op.getSubReg());
+ Copy.addReg(Op.getReg(), getKillRegState(Op.isKill()), Op.getSubReg());
Op.ChangeToRegister(VGPR, false);
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index 5720b97..787f7b3 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -29,9 +29,7 @@ public:
static char ID;
public:
- SIPostRABundlerLegacy() : MachineFunctionPass(ID) {
- initializeSIPostRABundlerLegacyPass(*PassRegistry::getPassRegistry());
- }
+ SIPostRABundlerLegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -110,7 +108,7 @@ void SIPostRABundler::collectUsedRegUnits(const MachineInstr &MI,
"subregister indexes should not be present after RA");
for (MCRegUnit Unit : TRI->regunits(Reg))
- UsedRegUnits.set(Unit);
+ UsedRegUnits.set(static_cast<unsigned>(Unit));
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index ecfaa5c..b9f2993 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -153,11 +153,13 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
for (unsigned Reg : RegsToRewrite) {
- LIS->removeInterval(Reg);
-
const Register PhysReg = VRM->getPhys(Reg);
assert(PhysReg != 0);
+ LiveInterval &LI = LIS->getInterval(Reg);
+ Matrix->unassign(LI, /*ClearAllReferencingSegments=*/true);
+ LIS->removeInterval(Reg);
+
MFI->reserveWWMRegister(PhysReg);
}
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 01a40c1..73aab4e 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -22,10 +22,11 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/BranchProbability.h"
-
using namespace llvm;
#define DEBUG_TYPE "si-pre-emit-peephole"
@@ -47,9 +48,6 @@ private:
const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
- // Check if the machine instruction being processed is a supported packed
- // instruction.
- bool isUnpackingSupportedInstr(MachineInstr &MI) const;
// Creates a list of packed instructions following an MFMA that are suitable
// for unpacking.
void collectUnpackingCandidates(MachineInstr &BeginMI,
@@ -68,11 +66,11 @@ private:
// this transformation.
void performF32Unpacking(MachineInstr &I);
// Select corresponding unpacked instruction
- uint16_t mapToUnpackedOpcode(MachineInstr &I);
+ uint32_t mapToUnpackedOpcode(MachineInstr &I);
// Creates the unpacked instruction to be inserted. Adds source modifiers to
// the unpacked instructions based on the source modifiers in the packed
// instruction.
- MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint16_t UnpackedOpcode,
+ MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint32_t UnpackedOpcode,
bool IsHiBits);
// Process operands/source modifiers from packed instructions and insert the
  // appropriate source modifiers and operands into the unpacked instructions.
@@ -87,9 +85,7 @@ class SIPreEmitPeepholeLegacy : public MachineFunctionPass {
public:
static char ID;
- SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) {
- initializeSIPreEmitPeepholeLegacyPass(*PassRegistry::getPassRegistry());
- }
+ SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
return SIPreEmitPeephole().run(MF);
@@ -156,11 +152,12 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
MachineOperand &Op1 = A->getOperand(1);
MachineOperand &Op2 = A->getOperand(2);
- if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
+ if ((!Op1.isReg() || Op1.getReg() != ExecReg) && Op2.isReg() &&
+ Op2.getReg() == ExecReg) {
TII->commuteInstruction(*A);
Changed = true;
}
- if (Op1.getReg() != ExecReg)
+ if (!Op1.isReg() || Op1.getReg() != ExecReg)
return Changed;
if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
return Changed;
@@ -299,7 +296,7 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
E = MI.getIterator();
I != E; ++I) {
- if (I->isBundle())
+ if (I->isBundle() || I->isDebugInstr())
continue;
switch (I->getOpcode()) {
case AMDGPU::S_SET_GPR_IDX_MODE:
@@ -454,23 +451,6 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
return true;
}
-// If support is extended to new operations, add tests in
-// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
-bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
- if (!TII->isNeverCoissue(MI))
- return false;
- unsigned Opcode = MI.getOpcode();
- switch (Opcode) {
- case AMDGPU::V_PK_ADD_F32:
- case AMDGPU::V_PK_MUL_F32:
- case AMDGPU::V_PK_FMA_F32:
- return true;
- default:
- return false;
- }
- llvm_unreachable("Fully covered switch");
-}
-
bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
unsigned OpCode = MI.getOpcode();
Register DstReg = MI.getOperand(0).getReg();
@@ -528,7 +508,7 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
return false;
}
-uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
+uint32_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
unsigned Opcode = I.getOpcode();
// Use 64 bit encoding to allow use of VOP3 instructions.
// VOP3 e64 instructions allow source modifiers
@@ -541,7 +521,7 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
case AMDGPU::V_PK_FMA_F32:
return AMDGPU::V_FMA_F32_e64;
default:
- return std::numeric_limits<uint16_t>::max();
+ return std::numeric_limits<uint32_t>::max();
}
llvm_unreachable("Fully covered switch");
}
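The sentinel widens from uint16_t to uint32_t along with the return type, presumably so that opcode values above 65535 cannot collide with the "no mapping" marker. A tiny self-contained check of that property:

#include <cstdint>
#include <limits>
constexpr uint32_t NoUnpackedOpc = std::numeric_limits<uint32_t>::max();
static_assert(NoUnpackedOpc > std::numeric_limits<uint16_t>::max(),
              "a 16-bit sentinel could collide with a wider opcode value");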
@@ -612,10 +592,13 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
+ uint32_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
+    bool IsUnpackable =
+        UnpackedOpCode != std::numeric_limits<uint32_t>::max();
if (Instr.isMetaInstruction())
continue;
if ((Instr.isTerminator()) ||
- (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
+ (TII->isNeverCoissue(Instr) && !IsUnpackable) ||
(SIInstrInfo::modifiesModeRegister(Instr) &&
Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
return;
@@ -639,7 +622,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
return;
}
- if (!isUnpackingSupportedInstr(Instr))
+ if (!IsUnpackable)
continue;
if (canUnpackingClobberRegister(Instr))
@@ -657,10 +640,10 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
}
void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
- MachineOperand DstOp = I.getOperand(0);
+ const MachineOperand &DstOp = I.getOperand(0);
- uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
- assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
+ uint32_t UnpackedOpcode = mapToUnpackedOpcode(I);
+ assert(UnpackedOpcode != std::numeric_limits<uint32_t>::max() &&
"Unsupported Opcode");
MachineInstrBuilder Op0LOp1L =
@@ -683,12 +666,12 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
}
MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
- uint16_t UnpackedOpcode,
+ uint32_t UnpackedOpcode,
bool IsHiBits) {
MachineBasicBlock &MBB = *I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
- const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
+ const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
+ const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
Register DstReg = I.getOperand(0).getReg();
unsigned OpCode = I.getOpcode();
Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
@@ -702,15 +685,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
NewMI.addDef(UnpackedDstReg); // vdst
- addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
- addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);
+ addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
+ addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
- const MachineOperand *SrcMO3 =
+ const MachineOperand *SrcMO2 =
TII->getNamedOperand(I, AMDGPU::OpName::src2);
unsigned Src2Mods =
TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
- addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
+ addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
}
NewMI.addImm(ClampVal); // clamp
  // Packed instructions do not support output modifiers. Safe to assign them 0
@@ -722,10 +705,17 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
PreservedAnalyses
llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM) {
- if (!SIPreEmitPeephole().run(MF))
- return PreservedAnalyses::all();
+ auto *MDT = MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
+ auto *MPDT = MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);
+
+ if (SIPreEmitPeephole().run(MF))
+ return getMachineFunctionPassPreservedAnalyses();
- return getMachineFunctionPassPreservedAnalyses();
+ if (MDT)
+ MDT->updateBlockNumbers();
+ if (MPDT)
+ MPDT->updateBlockNumbers();
+ return PreservedAnalyses::all();
}
bool SIPreEmitPeephole::run(MachineFunction &MF) {
@@ -787,9 +777,13 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
// TODO: Fold this into previous block, if possible. Evaluate and handle any
// side effects.
+
+  // Perform the extra MF scans only for supported architectures.
+ if (!ST.hasGFX940Insts())
+ return Changed;
for (MachineBasicBlock &MBB : MF) {
- // Unpack packed instructions overlapped by MFMAs. This allows the compiler
- // to co-issue unpacked instructions with MFMA
+ // Unpack packed instructions overlapped by MFMAs. This allows the
+ // compiler to co-issue unpacked instructions with MFMA
auto SchedModel = TII->getSchedModel();
SetVector<MachineInstr *> InstrsToUnpack;
for (auto &MI : make_early_inc_range(MBB.instrs())) {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ebd2e7e..ee46157 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -340,10 +340,12 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
"getNumCoveredRegs() will not work with generated subreg masks!");
RegPressureIgnoredUnits.resize(getNumRegUnits());
- RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
+ RegPressureIgnoredUnits.set(
+ static_cast<unsigned>(*regunits(MCRegister::from(AMDGPU::M0)).begin()));
for (auto Reg : AMDGPU::VGPR_16RegClass) {
if (AMDGPU::isHi16Reg(Reg, *this))
- RegPressureIgnoredUnits.set(*regunits(Reg).begin());
+ RegPressureIgnoredUnits.set(
+ static_cast<unsigned>(*regunits(Reg).begin()));
}
// HACK: Until this is fully tablegen'd.
@@ -864,7 +866,8 @@ bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
[[fallthrough]];
}
case AMDGPU::V_ADD_U32_e64:
- // FIXME: This optimization is barely profitable enableFlatScratch as-is.
+  // FIXME: This optimization is barely profitable with
+  // hasFlatScratchEnabled() as-is.
//
// Much of the benefit with the MUBUF handling is we avoid duplicating the
// shift of the frame register, which isn't needed with scratch.
@@ -872,7 +875,7 @@ bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
// materializeFrameBaseRegister doesn't know the register classes of the
// uses, and unconditionally uses an s_add_i32, which will end up using a
// copy for the vector uses.
- return !ST.enableFlatScratch();
+ return !ST.hasFlatScratchEnabled();
case AMDGPU::V_ADD_CO_U32_e32:
if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
!isFIPlusImmOrVGPR(*this, *MI))
@@ -912,12 +915,12 @@ Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
MachineFunction *MF = MBB->getParent();
const SIInstrInfo *TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
- : AMDGPU::V_MOV_B32_e32;
+ unsigned MovOpc =
+ ST.hasFlatScratchEnabled() ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
Register BaseReg = MRI.createVirtualRegister(
- ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
- : &AMDGPU::VGPR_32RegClass);
+ ST.hasFlatScratchEnabled() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
+ : &AMDGPU::VGPR_32RegClass);
if (Offset == 0) {
BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
@@ -927,16 +930,16 @@ Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- Register FIReg = MRI.createVirtualRegister(
- ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
- : &AMDGPU::VGPR_32RegClass);
+ Register FIReg = MRI.createVirtualRegister(ST.hasFlatScratchEnabled()
+ ? &AMDGPU::SReg_32_XM0RegClass
+ : &AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
.addImm(Offset);
BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
.addFrameIndex(FrameIdx);
- if (ST.enableFlatScratch() ) {
+ if (ST.hasFlatScratchEnabled()) {
// FIXME: Make sure scc isn't live in.
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
.addReg(OffsetReg, RegState::Kill)
@@ -989,9 +992,9 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
MachineRegisterInfo &MRI = MF->getRegInfo();
// FIXME: materializeFrameBaseRegister does not know the register class of
- // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit
- // a copy so we have a legal operand and hope the register coalescer can
- // clean it up.
+ // the uses of the frame index, and assumes SGPR for hasFlatScratchEnabled.
+ // Emit a copy so we have a legal operand and hope the register coalescer
+ // can clean it up.
if (isSGPRReg(MRI, BaseReg)) {
Register BaseRegVGPR =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -1407,7 +1410,7 @@ static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
unsigned Dst = IsStore ? Reg : ValueReg;
unsigned Src = IsStore ? ValueReg : Reg;
bool IsVGPR = TRI->isVGPR(MRI, Reg);
- DebugLoc DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI->getDebugLoc();
if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
// Spiller during regalloc may restore a spilled register to its superclass.
// It could result in AGPR spills restored to VGPRs or the other way around,
@@ -1546,7 +1549,10 @@ void SIRegisterInfo::buildSpillLoadStore(
int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
int64_t MaterializedOffset = Offset;
- int64_t MaxOffset = Offset + Size + RemSize - EltSize;
+    // MaxOffset is the starting offset of the last chunk to be spilled.
+    // In case of a non-zero remainder element, MaxOffset will be the end
+    // address (Offset + Size) after spilling all the EltSize chunks.
+ int64_t MaxOffset = Offset + Size - (RemSize ? 0 : EltSize);
int64_t ScratchOffsetRegDelta = 0;
if (IsFlat && EltSize > 4) {
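A quick check of the new MaxOffset formula with illustrative values (the sizes below are made up for the example):

// MaxOffset = Offset + Size - (RemSize ? 0 : EltSize)
static_assert(0 + 32 - (0 ? 0 : 16) == 16, "no remainder: last full chunk");
static_assert(0 + 24 - (8 ? 0 : 16) == 24, "remainder: the end address");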
@@ -1730,8 +1736,8 @@ void SIRegisterInfo::buildSpillLoadStore(
: Register(getSubReg(ValueReg,
getSubRegFromChannel(RegOffset / 4, NumRegs)));
- unsigned SOffsetRegState = 0;
- unsigned SrcDstRegState = getDefRegState(!IsStore);
+ RegState SOffsetRegState = {};
+ RegState SrcDstRegState = getDefRegState(!IsStore);
const bool IsLastSubReg = i + 1 == e;
const bool IsFirstSubReg = i == 0;
if (IsLastSubReg) {
@@ -1771,7 +1777,7 @@ void SIRegisterInfo::buildSpillLoadStore(
}
if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
NeedSuperRegImpOperand = true;
- unsigned State = SrcDstRegState;
+ RegState State = SrcDstRegState;
if (!IsLastSubReg || (Lane != LaneE))
State &= ~RegState::Kill;
if (!IsFirstSubReg || (Lane != LaneS))
@@ -1823,10 +1829,22 @@ void SIRegisterInfo::buildSpillLoadStore(
}
}
+ Register FinalValueReg = ValueReg;
+ if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) {
+    // If we are loading a 16-bit value with SRAMECC enabled, we need a temp
+    // 32-bit VGPR to load into and then extract 16 bits into the final
+    // register.
+ ValueReg =
+ RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
+ SubReg = ValueReg;
+ IsKill = false;
+ }
+
+  // Create the MMO, additionally setting the MOThreadPrivate flag as scratch
+  // memory used for spills will not be used outside the thread.
MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
- MachineMemOperand *NewMMO =
- MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
- commonAlignment(Alignment, RegOffset));
+ MachineMemOperand *NewMMO = MF->getMachineMemOperand(
+ PInfo, MMO->getFlags() | MOThreadPrivate, RemEltSize,
+ commonAlignment(Alignment, RegOffset));
auto MIB =
BuildMI(MBB, MI, DL, *Desc)
@@ -1863,6 +1881,17 @@ void SIRegisterInfo::buildSpillLoadStore(
MIB.addImm(0); // swz
MIB.addMemOperand(NewMMO);
+ if (FinalValueReg != ValueReg) {
+    // Extract the low 16 bits from the loaded 32-bit value.
+ ValueReg = getSubReg(ValueReg, AMDGPU::lo16);
+ MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64))
+ .addReg(FinalValueReg, getDefRegState(true))
+ .addImm(0)
+ .addReg(ValueReg, getKillRegState(true))
+ .addImm(0);
+ ValueReg = FinalValueReg;
+ }
+
if (!IsAGPR && NeedSuperRegDef)
MIB.addReg(ValueReg, RegState::ImplicitDefine);
@@ -1873,10 +1902,14 @@ void SIRegisterInfo::buildSpillLoadStore(
MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
}
- bool IsSrcDstDef = SrcDstRegState & RegState::Define;
+ bool IsSrcDstDef = hasRegState(SrcDstRegState, RegState::Define);
+ bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore;
if (NeedSuperRegImpOperand &&
- (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef)))
+ (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) {
MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
+ if (PartialReloadCopy)
+ MIB.addReg(ValueReg, RegState::Implicit);
+ }
// The epilog restore of a wwm-scratch register can cause undesired
// optimization during machine-cp post PrologEpilogInserter if the same
@@ -1924,7 +1957,7 @@ void SIRegisterInfo::buildSpillLoadStore(
void SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB,
Register BlockReg) const {
- const MachineFunction *MF = MIB->getParent()->getParent();
+ const MachineFunction *MF = MIB->getMF();
const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
@@ -1953,13 +1986,15 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
SB.EltSize, Alignment);
if (IsLoad) {
- unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
- : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
+ unsigned Opc = ST.hasFlatScratchEnabled()
+ ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
+ : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
} else {
- unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
- : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
+ unsigned Opc = ST.hasFlatScratchEnabled()
+ ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
+ : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
// This only ever adds one VGPR spill
@@ -2039,13 +2074,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
SB.prepare();
// SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
- unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
+ RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
// Per VGPR helper data
auto PVD = SB.getPerVGPRData();
for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
- unsigned TmpVGPRFlags = RegState::Undef;
+ RegState TmpVGPRFlags = RegState::Undef;
// Write sub registers into the VGPR
for (unsigned i = Offset * PVD.PerVGPR,
@@ -2062,7 +2097,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
.addReg(SubReg, SubKillState)
.addImm(i % PVD.PerVGPR)
.addReg(SB.TmpVGPR, TmpVGPRFlags);
- TmpVGPRFlags = 0;
+ TmpVGPRFlags = {};
if (Indexes) {
if (i == 0)
@@ -2075,7 +2110,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
// TODO: Can we detect this and skip the spill?
if (SB.NumSubRegs > 1) {
// The last implicit use of the SB.SuperReg carries the "Kill" flag.
- unsigned SuperKillState = 0;
+ RegState SuperKillState = {};
if (i + 1 == SB.NumSubRegs)
SuperKillState |= getKillRegState(SB.IsKill);
WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
@@ -2185,10 +2220,10 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
RS);
SB.prepare();
// Generate the spill of SGPR to SB.TmpVGPR.
- unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
+ RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
auto PVD = SB.getPerVGPRData();
for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
- unsigned TmpVGPRFlags = RegState::Undef;
+ RegState TmpVGPRFlags = RegState::Undef;
// Write sub registers into the VGPR
for (unsigned i = Offset * PVD.PerVGPR,
e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
@@ -2204,12 +2239,12 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
.addReg(SubReg, SubKillState)
.addImm(i % PVD.PerVGPR)
.addReg(SB.TmpVGPR, TmpVGPRFlags);
- TmpVGPRFlags = 0;
+ TmpVGPRFlags = {};
// There could be undef components of a spilled super register.
// TODO: Can we detect this and skip the spill?
if (SB.NumSubRegs > 1) {
// The last implicit use of the SB.SuperReg carries the "Kill" flag.
- unsigned SuperKillState = 0;
+ RegState SuperKillState = {};
if (i + 1 == SB.NumSubRegs)
SuperKillState |= getKillRegState(SB.IsKill);
WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
@@ -2294,7 +2329,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
- MachineFunction *MF = MI->getParent()->getParent();
+ MachineFunction *MF = MI->getMF();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
@@ -2415,13 +2450,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
unsigned Opc;
if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
- assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
+ assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
} else {
Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE
? AMDGPU::SCRATCH_STORE_BLOCK_SADDR
- : ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
- : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
+ : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
+ : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
}
auto *MBB = MI->getParent();
@@ -2500,13 +2535,15 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
unsigned Opc;
if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
- assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
- Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16;
+ assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
+ Opc = ST.d16PreservesUnusedBits()
+ ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16
+ : AMDGPU::SCRATCH_LOAD_USHORT_SADDR;
} else {
Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
- : ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
- : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
+ : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
+ : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
}
auto *MBB = MI->getParent();
@@ -2585,7 +2622,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
Offset = 0;
}
- if (FrameReg && !ST.enableFlatScratch()) {
+ if (FrameReg && !ST.hasFlatScratchEnabled()) {
// We should just do an in-place update of the result register. However,
// the value there may also be used by the add, in which case we need a
// temporary register.
@@ -2606,7 +2643,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
- if (ST.enableFlatScratch() &&
+ if (ST.hasFlatScratchEnabled() &&
!TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
// We didn't need the shift above, so we have an SGPR for the frame
// register, but may have a VGPR only operand.
@@ -2624,7 +2661,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
.addReg(MaterializedReg,
- MaterializedReg != FrameReg ? RegState::Kill : 0);
+ getKillRegState(MaterializedReg != FrameReg));
MaterializedReg = ScavengedVGPR;
}
@@ -2636,8 +2673,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (NumDefs == 2)
AddI32.add(MI->getOperand(1));
- unsigned MaterializedRegFlags =
- MaterializedReg != FrameReg ? RegState::Kill : 0;
+ RegState MaterializedRegFlags =
+ getKillRegState(MaterializedReg != FrameReg);
if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
// If we know we have a VGPR already, it's more likely the other
@@ -2767,7 +2804,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
TmpReg = DstOp.getReg();
- if (FrameReg && !ST.enableFlatScratch()) {
+ if (FrameReg && !ST.hasFlatScratchEnabled()) {
// FIXME: In the common case where the add does not also read its result
// (i.e. this isn't a reg += fi), it's not finding the dest reg as
// available.
@@ -2852,7 +2889,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
int64_t Offset = FrameInfo.getObjectOffset(Index);
- if (ST.enableFlatScratch()) {
+ if (ST.hasFlatScratchEnabled()) {
if (TII->isFLATScratch(*MI)) {
assert(
(int16_t)FIOperandNum ==
@@ -2954,10 +2991,36 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
: RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
MI, false, 0, !UseSGPR);
- // TODO: for flat scratch another attempt can be made with a VGPR index
- // if no SGPRs can be scavenged.
- if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
+ if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) {
+ int SVOpcode = AMDGPU::getFlatScratchInstSVfromSS(MI->getOpcode());
+ if (ST.hasFlatScratchSVSMode() && SVOpcode != -1) {
+ Register TmpVGPR = RS->scavengeRegisterBackwards(
+ AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
+
+ // Materialize the frame register.
+ auto MIB =
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR);
+ if (FrameReg)
+ MIB.addReg(FrameReg);
+ else
+ MIB.addImm(Offset);
+
+      // Fold a nonzero offset into the materialized address. The add must
+      // target the VGPR copy; a VALU instruction cannot write the SGPR
+      // frame register, which may also still be live.
+      if (FrameReg && Offset)
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
+            .addImm(Offset)
+            .addReg(TmpVGPR, RegState::Kill);
+
+ BuildMI(*MBB, MI, DL, TII->get(SVOpcode))
+ .add(MI->getOperand(0)) // $vdata
+ .addReg(TmpVGPR) // $vaddr
+ .addImm(0) // Offset
+ .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::cpol));
+ MI->eraseFromParent();
+ return true;
+ }
report_fatal_error("Cannot scavenge register in FI elimination!");
+ }
if (!TmpSReg) {
// Use frame register and restore it after.
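The fallback added above covers the case where the scavenger cannot produce an SGPR for an SS-form (saddr) flat-scratch access: the instruction is rewritten to its SV counterpart, which takes the address in a VGPR and so can be satisfied with a scavenged VGPR instead. Conceptually, in ISA terms (a sketch; register numbers are made up):

  // before (SS form, needs an SGPR address that we failed to scavenge):
  //   scratch_store_dword off, v0, s7
  // after (SV form, address lives in a scavenged VGPR):
  //   v_mov_b32_e32 v1, s7          ; materialize the frame register
  //   v_add_u32_e32 v1, 16, v1      ; fold in a nonzero frame offset
  //   scratch_store_dword v1, v0, off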
@@ -3019,7 +3082,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (!IsMUBUF && !MFI->isBottomOfStack()) {
// Convert to a swizzled stack address by scaling by the wave size.
// In an entry function/kernel the offset is already swizzled.
- bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
+ bool IsSALU = isSGPRClass(TII->getRegClass(MI->getDesc(), FIOperandNum));
bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
!MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
const TargetRegisterClass *RC = IsSALU && !LiveSCC
@@ -3531,6 +3594,17 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
}
const TargetRegisterClass *
+SIRegisterInfo::getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const {
+ // TODO: In principle this should use AV classes for gfx908 too. This is
+ // limited to 90a+ to avoid regressing special case copy optimizations which
+ // need new handling. The core issue is that it's not possible to directly
+ // copy between AGPRs on gfx908, and the current optimizations around that
+ // expect to see copies to VGPR.
+ return ST.hasGFX90AInsts() ? getVectorSuperClassForBitWidth(BitWidth)
+ : getVGPRClassForBitWidth(BitWidth);
+}
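For context on the gfx908 restriction noted in the comment: gfx908 has no direct AGPR-to-AGPR move, so any such copy must bounce through a VGPR, which is the shape the existing copy optimizations are written around. An ISA-level sketch of the difference (illustration only, not the pass's output):

  // gfx908: AGPR -> AGPR copies round-trip through a VGPR.
  //   v_accvgpr_read_b32  v0, a1
  //   v_accvgpr_write_b32 a0, v0
  // gfx90a+: a direct move exists, so AV super classes are safe defaults.
  //   v_accvgpr_mov_b32   a0, a1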
+
+const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
if (BitWidth == 16 || BitWidth == 32)
return &AMDGPU::SReg_32RegClass;
@@ -3601,6 +3675,14 @@ SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
}
const TargetRegisterClass *
+SIRegisterInfo::getEquivalentAVClass(const TargetRegisterClass *SRC) const {
+ unsigned Size = getRegSizeInBits(*SRC);
+ const TargetRegisterClass *ARC = getVectorSuperClassForBitWidth(Size);
+ assert(ARC && "Invalid register class size");
+ return ARC;
+}
+
+const TargetRegisterClass *
SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
unsigned Size = getRegSizeInBits(*VRC);
if (Size == 32)
@@ -3707,27 +3789,6 @@ bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
return RC && isAGPRClass(RC);
}
-bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
- const TargetRegisterClass *SrcRC,
- unsigned SubReg,
- const TargetRegisterClass *DstRC,
- unsigned DstSubReg,
- const TargetRegisterClass *NewRC,
- LiveIntervals &LIS) const {
- unsigned SrcSize = getRegSizeInBits(*SrcRC);
- unsigned DstSize = getRegSizeInBits(*DstRC);
- unsigned NewSize = getRegSizeInBits(*NewRC);
-
- // Do not increase size of registers beyond dword, we would need to allocate
- // adjacent registers and constraint regalloc more than needed.
-
- // Always allow dword coalescing.
- if (SrcSize <= 32 || DstSize <= 32)
- return true;
-
- return NewSize <= DstSize || NewSize <= SrcSize;
-}
-
unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
@@ -3761,10 +3822,10 @@ unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
llvm_unreachable("Unexpected register pressure set!");
}
-const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
+const int *SIRegisterInfo::getRegUnitPressureSets(MCRegUnit RegUnit) const {
static const int Empty[] = { -1 };
- if (RegPressureIgnoredUnits[RegUnit])
+ if (RegPressureIgnoredUnits[static_cast<unsigned>(RegUnit)])
return Empty;
return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
@@ -3888,20 +3949,6 @@ const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
: &AMDGPU::VReg_64RegClass;
}
-const TargetRegisterClass *
-SIRegisterInfo::getRegClass(unsigned RCID) const {
- switch ((int)RCID) {
- case AMDGPU::SReg_1RegClassID:
- return getBoolRC();
- case AMDGPU::SReg_1_XEXECRegClassID:
- return getWaveMaskRegClass();
- case -1:
- return nullptr;
- default:
- return AMDGPUGenRegisterInfo::getRegClass(RCID);
- }
-}
-
// Find reaching register definition
MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
MachineInstr &Use,
@@ -3990,28 +4037,6 @@ bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
return true;
}
-const TargetRegisterClass *
-SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
- if (!RC || !ST.needsAlignedVGPRs())
- return RC;
-
- unsigned Size = getRegSizeInBits(*RC);
- if (Size <= 32)
- return RC;
-
- if (RC == &AMDGPU::VS_64RegClass)
- return &AMDGPU::VS_64_Align2RegClass;
-
- if (isVGPRClass(RC))
- return getAlignedVGPRClassForBitWidth(Size);
- if (isAGPRClass(RC))
- return getAlignedAGPRClassForBitWidth(Size);
- if (isVectorSuperClass(RC))
- return getAlignedVectorSuperClassForBitWidth(Size);
-
- return RC;
-}
-
ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 7b91ba7..9d1a9ea 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -107,9 +107,7 @@ public:
// Stack access is very expensive. CSRs are also the high registers, and we
// want to minimize the number of used registers.
- unsigned getCSRFirstUseCost() const override {
- return 100;
- }
+ unsigned getCSRCost() const override { return 100; }
// When building a block VGPR load, we only really transfer a subset of the
// registers in the block, based on a mask. Liveness analysis is not aware of
@@ -216,6 +214,10 @@ public:
getVectorSuperClassForBitWidth(unsigned BitWidth) const;
LLVM_READONLY
+ const TargetRegisterClass *
+ getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const;
+
+ LLVM_READONLY
static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth);
/// \returns true if this class contains only SGPR registers
@@ -285,6 +287,10 @@ public:
const TargetRegisterClass *
getEquivalentAGPRClass(const TargetRegisterClass *SRC) const;
+ /// \returns An AGPR+VGPR super reg class with the same width as \p SRC
+ const TargetRegisterClass *
+ getEquivalentAVClass(const TargetRegisterClass *SRC) const;
+
/// \returns A SGPR reg class with the same width as \p SRC
const TargetRegisterClass *
getEquivalentSGPRClass(const TargetRegisterClass *VRC) const;
@@ -338,14 +344,6 @@ public:
ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
unsigned EltSize) const;
- bool shouldCoalesce(MachineInstr *MI,
- const TargetRegisterClass *SrcRC,
- unsigned SubReg,
- const TargetRegisterClass *DstRC,
- unsigned DstSubReg,
- const TargetRegisterClass *NewRC,
- LiveIntervals &LIS) const override;
-
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
@@ -357,7 +355,7 @@ public:
const MachineFunction &MF, const VirtRegMap *VRM,
const LiveRegMatrix *Matrix) const override;
- const int *getRegUnitPressureSets(unsigned RegUnit) const override;
+ const int *getRegUnitPressureSets(MCRegUnit RegUnit) const override;
MCRegister getReturnAddressReg(const MachineFunction &MF) const;
@@ -391,8 +389,6 @@ public:
MCRegister getExec() const;
- const TargetRegisterClass *getRegClass(unsigned RCID) const;
-
// Find reaching register definition
MachineInstr *findReachingDef(Register Reg, unsigned SubReg,
MachineInstr &Use,
@@ -433,11 +429,6 @@ public:
// the subtarget.
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const;
- // Given \p RC returns corresponding aligned register class if required
- // by the subtarget.
- const TargetRegisterClass *
- getProperlyAlignedRC(const TargetRegisterClass *RC) const;
-
/// Return all SGPR128 which satisfy the waves per execution unit requirement
/// of the subtarget.
ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const;
@@ -495,6 +486,17 @@ public:
SmallVector<StringLiteral>
getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override;
+
+ float
+ getSpillWeightScaleFactor(const TargetRegisterClass *RC) const override {
+ // Prioritize VGPR_32_Lo256 over other classes which may occupy registers
+ // beyond v256.
+ return AMDGPUGenRegisterInfo::getSpillWeightScaleFactor(RC) *
+ ((RC == &AMDGPU::VGPR_32_Lo256RegClass ||
+ RC == &AMDGPU::VReg_64_Lo256_Align2RegClass)
+ ? 2.0
+ : 1.0);
+ }
};
namespace AMDGPU {
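The spill-weight hook added at the end of the class biases the greedy allocator: scaling an interval's weight up makes it look more expensive to spill, so intervals constrained to the low VGPRs win eviction contests against unconstrained ones. A rough sketch of the effect (the 2.0 factor is the one from the code; the weights are illustrative):

  // Two otherwise-identical intervals competing for v0-v255:
  //   weight(VGPR_32 interval)       = W       // may also live in v256+
  //   weight(VGPR_32_Lo256 interval) = 2 * W   // only v0-v255 can hold it
  // The allocator spills/evicts the cheaper, unconstrained interval first.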
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index fc8f46a..493e267 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -614,9 +614,9 @@ def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (interleave (sequence "VGPR%u_LO16", 0, 127),
(sequence "VGPR%u_HI16", 0, 127)))> {
+ let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 16;
let GeneratePressureSet = 0;
- let isAllocatable = 0;
// This is the base class for VGPR{0..127}_{LO16,HI16}.
let BaseClassOrder = 16;
@@ -644,7 +644,7 @@ def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg1
// Identical to VGPR_32 except it only contains the low 256 (Lo256) registers.
def VGPR_32_Lo256 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 255))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(3, !mul(BaseClassPriority, BaseClassScaleFactor));
let GeneratePressureSet = 0;
let Size = 32;
let Weight = 1;
@@ -896,20 +896,6 @@ def SReg_64_Encodable : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v
let Size = 64;
}
-def SReg_1_XEXEC : SIRegisterClass<"AMDGPU", [i1], 32,
- (add SReg_64_XEXEC, SReg_32_XEXEC)> {
- let CopyCost = 1;
- let isAllocatable = 0;
- let HasSGPR = 1;
-}
-
-def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32,
- (add SReg_1_XEXEC, EXEC, EXEC_LO, EXEC_HI)> {
- let CopyCost = 1;
- let isAllocatable = 0;
- let HasSGPR = 1;
-}
-
multiclass SRegClass<int numRegs,
list<ValueType> regTypes,
SIRegisterTuples regList,
@@ -1205,6 +1191,34 @@ defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>;
defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_1024)>;
}
+def SReg_1_XEXEC : SIRegisterClassLike<0, false, false, true>,
+ RegClassByHwMode<
+ [DefaultMode_Wave64,
+ AlignedVGPRNoAGPRMode_Wave64,
+ AVAlign2LoadStoreMode,
+ DefaultMode_Wave32,
+ AlignedVGPRNoAGPRMode_Wave32],
+ [SReg_64_XEXEC,
+ SReg_64_XEXEC,
+ SReg_64_XEXEC,
+ SReg_32_XM0_XEXEC, // FIXME: Why do the wave32 cases exclude m0?
+ SReg_32_XM0_XEXEC]
+>;
+
+def SReg_1 : SIRegisterClassLike<0, false, false, true>,
+ RegClassByHwMode<
+ [DefaultMode_Wave64,
+ AlignedVGPRNoAGPRMode_Wave64,
+ AVAlign2LoadStoreMode,
+ DefaultMode_Wave32,
+ AlignedVGPRNoAGPRMode_Wave32],
+ [SReg_64,
+ SReg_64,
+ SReg_64,
+ SReg_32,
+ SReg_32]
+>;
+
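SReg_1 and SReg_1_XEXEC model i1 lane masks, whose width follows the wavefront size; the RegClassByHwMode tables above replace the unallocatable pseudo classes deleted earlier in this patch. A minimal C++ analogue of the mapping (a sketch only; the real selection happens through the HwMode machinery):

  const TargetRegisterClass *getLaneMaskRC(const GCNSubtarget &ST) {
    // A lane mask holds one bit per lane, i.e. a full EXEC-sized SGPR.
    return ST.isWave32() ? &AMDGPU::SReg_32RegClass
                         : &AMDGPU::SReg_64RegClass;
  }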
//===----------------------------------------------------------------------===//
//
// AlignTarget classes. Artificial classes to swap between
@@ -1212,17 +1226,36 @@ defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_102
//
//===----------------------------------------------------------------------===//
+// We have 3 orthogonal properties to consider. Unfortunately we need
+// to define the cross product of these states, minus unused
+// combinations.
+
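The three properties are the wavefront size (wave32 vs. wave64), whether AGPRs may appear in load/store operands (AVAlign2LoadStoreMode), and whether VGPR tuples must be even-aligned with no usable AGPRs (AlignedVGPRNoAGPRMode). Only five combinations are instantiated, apparently because the AGPR-capable align2 targets (gfx90a-style) are wave64-only. Enumerated as a sketch:

  //   DefaultMode_Wave64           : wave64, no alignment requirement
  //   DefaultMode_Wave32           : wave32, no alignment requirement
  //   AVAlign2LoadStoreMode        : wave64, even-aligned, AGPR ld/st allowed
  //   AlignedVGPRNoAGPRMode_Wave64 : wave64, even-aligned, VGPRs only
  //   AlignedVGPRNoAGPRMode_Wave32 : wave32, even-aligned, VGPRs only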
def AV_LdSt_32_Target : RegClassByHwMode<
- [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
- [VGPR_32, AV_32, VGPR_32]>, SIRegisterClassLike<32, true, true> {
+ [DefaultMode_Wave64,
+ DefaultMode_Wave32,
+ AVAlign2LoadStoreMode,
+ AlignedVGPRNoAGPRMode_Wave64,
+ AlignedVGPRNoAGPRMode_Wave32],
+ [VGPR_32,
+ VGPR_32,
+ AV_32,
+ VGPR_32,
+ VGPR_32]>,
+ SIRegisterClassLike<32, true, true> {
let DecoderMethod = "decodeAVLdSt";
}
foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 1024 ] in {
def VReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true>,
RegClassByHwMode<
- [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+ [DefaultMode_Wave64,
+ DefaultMode_Wave32,
+ AVAlign2LoadStoreMode,
+ AlignedVGPRNoAGPRMode_Wave64,
+ AlignedVGPRNoAGPRMode_Wave32],
[!cast<RegisterClass>("VReg_"#RegSize),
+ !cast<RegisterClass>("VReg_"#RegSize),
+ !cast<RegisterClass>("VReg_"#RegSize#_Align2),
!cast<RegisterClass>("VReg_"#RegSize#_Align2),
!cast<RegisterClass>("VReg_"#RegSize#_Align2)]> {
let DecoderMethod = "DecodeVReg_"#RegSize#"RegisterClass";
@@ -1230,45 +1263,59 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 10
def AReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, false, true>,
RegClassByHwMode<
- [DefaultMode, AVAlign2LoadStoreMode, /*Unused combination*/],
+    [DefaultMode_Wave64,
+     /*unused combination*/
+     AVAlign2LoadStoreMode,
+     /*Unused combination*/
+     /*Unused combination*/],
[!cast<RegisterClass>("AReg_"#RegSize),
+ /*unused combination*/
!cast<RegisterClass>("AReg_"#RegSize#_Align2)
+ /*Unused combination*/
/*Unused combination*/]> {
let DecoderMethod = "DecodeAReg_"#RegSize#"RegisterClass";
}
def AV_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>,
RegClassByHwMode<
- [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+      [DefaultMode_Wave64,
+       DefaultMode_Wave32,
+ AVAlign2LoadStoreMode,
+ AlignedVGPRNoAGPRMode_Wave64,
+ AlignedVGPRNoAGPRMode_Wave32],
[!cast<RegisterClass>("AV_"#RegSize),
+ !cast<RegisterClass>("AV_"#RegSize),
!cast<RegisterClass>("AV_"#RegSize#_Align2),
+ !cast<RegisterClass>("VReg_"#RegSize#_Align2),
!cast<RegisterClass>("VReg_"#RegSize#_Align2)]> {
let DecoderMethod = "DecodeAV_"#RegSize#"RegisterClass";
}
def AV_LdSt_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>,
RegClassByHwMode<
- [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+ [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32],
[!cast<RegisterClass>("VReg_"#RegSize),
+ !cast<RegisterClass>("VReg_"#RegSize),
!cast<RegisterClass>("AV_"#RegSize#_Align2),
+ !cast<RegisterClass>("VReg_"#RegSize#_Align2),
!cast<RegisterClass>("VReg_"#RegSize#_Align2)]> {
let DecoderMethod = "decodeAVLdSt";
}
def AV_LdSt_#RegSize#_Align2 : SIRegisterClassLike<RegSize, true, true>,
RegClassByHwMode<
- [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+ [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32],
[!cast<RegisterClass>("VReg_"#RegSize#_Align2),
+ !cast<RegisterClass>("VReg_"#RegSize#_Align2),
!cast<RegisterClass>("AV_"#RegSize#_Align2),
+ !cast<RegisterClass>("VReg_"#RegSize#_Align2),
!cast<RegisterClass>("VReg_"#RegSize#_Align2)]> {
let DecoderMethod = "decodeAVLdSt";
}
def AV_LdSt_#RegSize#_Align1 : SIRegisterClassLike<RegSize, true, true>,
RegClassByHwMode<
- [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+ [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32],
[!cast<RegisterClass>("VReg_"#RegSize),
+ !cast<RegisterClass>("VReg_"#RegSize),
!cast<RegisterClass>("AV_"#RegSize),
+ !cast<RegisterClass>("VReg_"#RegSize),
!cast<RegisterClass>("VReg_"#RegSize)]> {
let DecoderMethod = "decodeAVLdSt";
}
@@ -1276,11 +1323,22 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 10
def VS_64_AlignTarget : SIRegisterClassLike<64, true, false, true>,
RegClassByHwMode<
- [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
- [VS_64, VS_64_Align2, VS_64_Align2]> {
+ [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32],
+ [VS_64, VS_64, VS_64_Align2, VS_64_Align2, VS_64_Align2]> {
let DecoderMethod = "decodeSrcRegOrImm9";
}
+
+// Special case for DS_GWS instructions. The register input is really
+// 32-bit, but it needs to be even aligned on targets with a VGPR
+// alignment requirement.
+def AV_LdSt_32_Align2 : SIRegisterClassLike</*Bitwidth=*/32, /*VGPR=*/true, /*AGPR=*/true>,
+ RegClassByHwMode<
+ [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32],
+ [VGPR_32, VGPR_32, AV_64_Align2, VReg_64_Align2, VReg_64_Align2]> {
+ let DecoderMethod = "decodeAVLdSt<32>";
+}
+
class RegImmMatcher<string name> : AsmOperandClass {
let Name = name;
let RenderMethod = "addRegOrImmOperands";
@@ -1314,12 +1372,12 @@ class SrcRegOrImm9_t16<string operandType, RegisterClass regClass = VS_16>
let EncoderMethod = "getMachineOpValueT16";
}
-def SSrc_b16 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_INT16">;
-def SSrc_bf16: SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_BF16">;
-def SSrc_f16 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_FP16">;
-def SSrc_b32 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_INT32">;
-def SSrc_f32 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_FP32">;
-def SSrc_b64 : SrcRegOrImm9 <SReg_64_Encodable, "OPERAND_REG_IMM_INT64">;
+def SSrc_b16 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_INT16">;
+def SSrc_bf16 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_BF16">;
+def SSrc_f16 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_FP16">;
+def SSrc_b32 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_INT32">;
+def SSrc_f32 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_FP32">;
+def SSrc_b64 : SrcRegOrImm9 <SReg_64_Encodable, "OPERAND_REG_IMM_INT64">;
def SSrcOrLds_b32 : SrcRegOrImm9 <SRegOrLds_32, "OPERAND_REG_IMM_INT32">;
@@ -1335,35 +1393,35 @@ def SCSrc_b64 : SrcRegOrImm9 <SReg_64, "OPERAND_REG_INLINE_C_INT64">;
//===----------------------------------------------------------------------===//
// The current and temporary future default used case for VOP3.
-def VSrc_b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_INT16">;
-def VSrc_bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_BF16">;
-def VSrc_f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_FP16">;
+def VSrc_b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_INT16">;
+def VSrc_bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_BF16">;
+def VSrc_f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_FP16">;
// True16 VOP3 operands.
-def VSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_INT16">;
+def VSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_INT16">;
def VSrcT_bf16 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_BF16">;
-def VSrcT_f16 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_FP16">;
+def VSrcT_f16 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_FP16">;
// True16 VOP1/2/C operands.
let DecoderMethodName = "decodeOperand_VSrcT16_Lo128", EncoderMethod = "getMachineOpValueT16Lo128" in {
- def VSrcT_b16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_INT16", VS_16_Lo128>;
- def VSrcT_bf16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_BF16", VS_16_Lo128>;
- def VSrcT_f16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_FP16", VS_16_Lo128>;
+ def VSrcT_b16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_INT16", VS_16_Lo128>;
+ def VSrcT_bf16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_BF16", VS_16_Lo128>;
+ def VSrcT_f16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_FP16", VS_16_Lo128>;
} // End DecoderMethodName = "decodeOperand_VSrcT16_Lo128", EncoderMethod = "getMachineOpValueT16Lo128"
// The current and temporary future default used case for fake VOP1/2/C.
// For VOP1,2,C True16 instructions. _Lo128 use first 128 32-bit VGPRs only.
-def VSrcFake16_b16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPERAND_REG_IMM_INT16">;
+def VSrcFake16_b16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPERAND_REG_IMM_INT16">;
def VSrcFake16_bf16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPERAND_REG_IMM_BF16">;
-def VSrcFake16_f16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPERAND_REG_IMM_FP16">;
+def VSrcFake16_f16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPERAND_REG_IMM_FP16">;
-def VSrc_b32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_INT32">;
-def VSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_FP32">;
-def VSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2INT16">;
+def VSrc_b32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_INT32">;
+def VSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_FP32">;
+def VSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2INT16">;
def VSrc_v2bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2BF16">;
-def VSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2FP16">;
-def VSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_INT64">;
-def VSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_FP64"> {
+def VSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2FP16">;
+def VSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_INT64">;
+def VSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_FP64"> {
let DecoderMethod = "decodeOperand_VSrc_f64";
}
def VSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_V2INT32">;
@@ -1371,6 +1429,8 @@ def VSrc_v2f32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_V2FP32">;
def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16">;
+def VSrc_v2f16_splat : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2FP16_SPLAT">;
+
//===----------------------------------------------------------------------===//
// VRegSrc_* Operands with a VGPR
//===----------------------------------------------------------------------===//
@@ -1381,15 +1441,15 @@ class SrcReg9<RegisterClassLike regClass> : RegisterOperand<regClass> {
let DecoderMethod = "decodeSrcReg9<" # !cast<SIRegisterClassLike>(regClass).Size # ">";
}
-def VRegSrc_32 : SrcReg9<VGPR_32>;
-def VRegSrc_64 : SrcReg9<VReg_64_AlignTarget>;
-def VRegSrc_96 : SrcReg9<VReg_96_AlignTarget>;
-def VRegSrc_128 : SrcReg9<VReg_128_AlignTarget>;
-def VRegSrc_192 : SrcReg9<VReg_192_AlignTarget>;
-def VRegSrc_256 : SrcReg9<VReg_256_AlignTarget>;
-def VRegSrc_384 : SrcReg9<VReg_384_AlignTarget>;
-def VRegSrc_512 : SrcReg9<VReg_512_AlignTarget>;
-def VRegSrc_1024 : SrcReg9<VReg_1024_AlignTarget>;
+def VRegSrc_32 : SrcReg9<VGPR_32>;
+def VRegSrc_64 : SrcReg9<VReg_64_AlignTarget>;
+def VRegSrc_96 : SrcReg9<VReg_96_AlignTarget>;
+def VRegSrc_128 : SrcReg9<VReg_128_AlignTarget>;
+def VRegSrc_192 : SrcReg9<VReg_192_AlignTarget>;
+def VRegSrc_256 : SrcReg9<VReg_256_AlignTarget>;
+def VRegSrc_384 : SrcReg9<VReg_384_AlignTarget>;
+def VRegSrc_512 : SrcReg9<VReg_512_AlignTarget>;
+def VRegSrc_1024 : SrcReg9<VReg_1024_AlignTarget>;
def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32>;
// True 16 Operands
@@ -1454,44 +1514,44 @@ def ARegSrc_32 : AVOperand<AGPR_32, "decodeSrcA9">;
// VCSrc_* Operands with an SGPR, VGPR or an inline constant
//===----------------------------------------------------------------------===//
-def VCSrc_b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_INT16">;
-def VCSrc_bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_BF16">;
-def VCSrc_f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP16">;
-def VCSrc_b32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_INT32">;
-def VCSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP32">;
-def VCSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_INT64">;
-def VCSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">;
-def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">;
-def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">;
-def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">;
-def VCSrc_b32_Lo256 : SrcRegOrImm9 <VS_32_Lo256, "OPERAND_REG_INLINE_C_INT32">;
-def VCSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_V2INT32">;
-def VCSrc_b64_Lo256 : SrcRegOrImm9 <VS_64_Lo256, "OPERAND_REG_INLINE_C_INT64">;
+def VCSrc_b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_INT16">;
+def VCSrc_bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_BF16">;
+def VCSrc_f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP16">;
+def VCSrc_b32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_INT32">;
+def VCSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP32">;
+def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">;
+def VCSrc_v2bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">;
+def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">;
+def VCSrc_b32_Lo256 : SrcRegOrImm9 <VS_32_Lo256, "OPERAND_REG_INLINE_C_INT32">;
+def VCSrc_b64_Lo256 : SrcRegOrImm9 <VS_64_Lo256, "OPERAND_REG_INLINE_C_INT64">;
+def VCSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_INT64">;
+def VCSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">;
+def VCSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_V2INT32">;
// True 16 Operands
-def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">;
-def VCSrcT_bf16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_BF16">;
-def VCSrcT_f16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_FP16">;
+def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">;
+def VCSrcT_bf16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_BF16">;
+def VCSrcT_f16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_FP16">;
//===----------------------------------------------------------------------===//
// VISrc_* Operands with a VGPR or an inline constant
//===----------------------------------------------------------------------===//
-def VISrc_64_bf16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_BF16">;
-def VISrc_64_f16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP16">;
-def VISrc_64_b32 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_INT32">;
-def VISrc_64_f64 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">;
-def VISrc_128_bf16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_BF16">;
-def VISrc_128_f16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP16">;
-def VISrc_128_b32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_INT32">;
-def VISrc_128_f32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP32">;
-def VISrc_256_b32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_INT32">;
-def VISrc_256_f32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP32">;
-def VISrc_256_f64 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP64">;
-def VISrc_512_b32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_INT32">;
-def VISrc_512_f32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP32">;
-def VISrc_512_f64 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP64">;
-def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_INT32">;
-def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_FP32">;
+def VISrc_64_bf16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_BF16">;
+def VISrc_64_f16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP16">;
+def VISrc_64_b32 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_INT32">;
+def VISrc_64_f64 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">;
+def VISrc_128_bf16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_BF16">;
+def VISrc_128_f16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP16">;
+def VISrc_128_b32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_INT32">;
+def VISrc_128_f32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP32">;
+def VISrc_256_b32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_INT32">;
+def VISrc_256_f32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP32">;
+def VISrc_256_f64 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP64">;
+def VISrc_512_b32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_INT32">;
+def VISrc_512_f32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP32">;
+def VISrc_512_f64 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP64">;
+def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_INT32">;
+def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_FP32">;
//===----------------------------------------------------------------------===//
// AVSrc_*, AVDst_*, AVLdSt_* Operands with an AGPR or VGPR
@@ -1500,13 +1560,13 @@ def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_
class AVSrcOperand<RegisterClassLike regClass>
: AVOperand<regClass, "decodeSrcAV10">;
-def AVSrc_32 : AVSrcOperand<AV_32>;
-def AVSrc_64 : AVSrcOperand<AV_64_AlignTarget>;
+def AVSrc_32 : AVSrcOperand<AV_32>;
+def AVSrc_64 : AVSrcOperand<AV_64_AlignTarget>;
def AVSrc_128 : AVSrcOperand<AV_128_AlignTarget>;
def AVSrc_192 : AVSrcOperand<AV_192_AlignTarget>;
def AVSrc_256 : AVSrcOperand<AV_256_AlignTarget>;
-def AVSrc_64_Align2 : AVSrcOperand<AV_64_Align2>;
+def AVSrc_64_Align2 : AVSrcOperand<AV_64_Align2>;
def AVSrc_128_Align2 : AVSrcOperand<AV_128_Align2>;
def AVSrc_192_Align2 : AVSrcOperand<AV_192_Align2>;
def AVSrc_256_Align2 : AVSrcOperand<AV_256_Align2>;
@@ -1528,11 +1588,22 @@ class AVLdStOperand<RegisterClassLike regClass>
def AVLdSt_32 : AVLdStOperand<AV_LdSt_32_Target>;
foreach size = ["64", "96", "128", "160", "256", "1024" ] in {
- def AVLdSt_#size : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_AlignTarget)>;
+ def AVLdSt_#size : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_AlignTarget)>;
def AVLdSt_#size#_Align1 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align1)>;
def AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align2)>;
}
+def AV_LdSt_32_Align2_RegMatcher : AsmOperandClass {
+ let Name = "AV_LdSt_32_Align2_RegOp";
+ let RenderMethod = "addRegOperands";
+}
+
+def AV_LdSt_32_Align2_RegOp : RegisterOperand<AV_LdSt_32_Align2> {
+ let ParserMatchClass = AV_LdSt_32_Align2_RegMatcher;
+ let PrintMethod = "printAVLdSt32Align2RegOp";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
//===----------------------------------------------------------------------===//
// ACSrc_* Operands with an AGPR or an inline constant
//===----------------------------------------------------------------------===//
@@ -1542,14 +1613,14 @@ class SrcRegOrImmA9<RegisterClassLike regClass, string operandType>
let DecoderMethod = "decodeSrcRegOrImmA9<" # !cast<SIRegisterClassLike>(regClass).Size # ">";
}
-def AISrc_64_f64 : SrcRegOrImmA9 <AReg_64_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">;
-def AISrc_128_f32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">;
-def AISrc_128_b32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">;
-def AISrc_256_f64 : SrcRegOrImmA9 <AReg_256_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">;
-def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">;
-def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">;
-def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">;
-def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">;
+def AISrc_64_f64 : SrcRegOrImmA9 <AReg_64_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">;
+def AISrc_128_f32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">;
+def AISrc_128_b32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">;
+def AISrc_256_f64 : SrcRegOrImmA9 <AReg_256_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">;
+def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">;
+def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">;
+def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">;
+def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">;
//===----------------------------------------------------------------------===//
// Tablegen programming utilities
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 179ecba..14ed778 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -27,6 +27,8 @@ using namespace llvm;
namespace {
+enum ChangeKind { None, UpdateHint, UpdateInst };
+
class SIShrinkInstructions {
MachineFunction *MF;
MachineRegisterInfo *MRI;
@@ -41,10 +43,10 @@ class SIShrinkInstructions {
bool isKUImmOperand(const MachineOperand &Src) const;
bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
- void shrinkScalarCompare(MachineInstr &MI) const;
- void shrinkMIMG(MachineInstr &MI) const;
- void shrinkMadFma(MachineInstr &MI) const;
- bool shrinkScalarLogicOp(MachineInstr &MI) const;
+ bool shrinkScalarCompare(MachineInstr &MI) const;
+ bool shrinkMIMG(MachineInstr &MI) const;
+ bool shrinkMadFma(MachineInstr &MI) const;
+ ChangeKind shrinkScalarLogicOp(MachineInstr &MI) const;
bool tryReplaceDeadSDST(MachineInstr &MI) const;
bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
Register Reg, unsigned SubReg) const;
@@ -241,27 +243,30 @@ void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
}
}
-void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
+bool SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
if (!ST->hasSCmpK())
- return;
+ return false;
// cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
// get constants on the RHS.
- if (!MI.getOperand(0).isReg())
- TII->commuteInstruction(MI, false, 0, 1);
+ bool Changed = false;
+ if (!MI.getOperand(0).isReg()) {
+ if (TII->commuteInstruction(MI, false, 0, 1))
+ Changed = true;
+ }
// cmpk requires src0 to be a register
const MachineOperand &Src0 = MI.getOperand(0);
if (!Src0.isReg())
- return;
+ return Changed;
MachineOperand &Src1 = MI.getOperand(1);
if (!Src1.isImm())
- return;
+ return Changed;
int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
if (SOPKOpc == -1)
- return;
+ return Changed;
// eq/ne is special because the imm16 can be treated as signed or unsigned,
// and initially selected to the unsigned versions.
@@ -275,9 +280,10 @@ void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
}
MI.setDesc(TII->get(SOPKOpc));
+ Changed = true;
}
- return;
+ return Changed;
}
const MCInstrDesc &NewDesc = TII->get(SOPKOpc);
@@ -287,14 +293,16 @@ void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
if (!SIInstrInfo::sopkIsZext(SOPKOpc))
Src1.setImm(SignExtend64(Src1.getImm(), 32));
MI.setDesc(NewDesc);
+ Changed = true;
}
+ return Changed;
}
// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
-void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
+bool SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
if (!Info)
- return;
+ return false;
uint8_t NewEncoding;
switch (Info->MIMGEncoding) {
@@ -305,7 +313,7 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
NewEncoding = AMDGPU::MIMGEncGfx11Default;
break;
default:
- return;
+ return false;
}
int VAddr0Idx =
@@ -359,7 +367,7 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
} else if (Vgpr == NextVgpr) {
NextVgpr = Vgpr + Dwords;
} else {
- return;
+ return false;
}
if (!Op.isUndef())
@@ -369,7 +377,7 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
}
if (VgprBase + NewAddrDwords > 256)
- return;
+ return false;
// Further check for implicit tied operands - this may be present if TFE is
// enabled
@@ -408,21 +416,22 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
ToUntie - (EndVAddr - 1));
}
+ return true;
}
// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
-void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
+bool SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
// Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
// there is no reason to try to shrink them.
if (!ST->hasVOP3Literal())
- return;
+ return false;
// There is no advantage to doing this pre-RA.
if (!IsPostRA)
- return;
+ return false;
if (TII->hasAnyModifiersSet(MI))
- return;
+ return false;
const unsigned Opcode = MI.getOpcode();
MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
@@ -439,7 +448,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
Swap = true;
else
- return;
+ return false;
switch (Opcode) {
default:
@@ -477,7 +486,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
else if (Src0.isImm() && !TII->isInlineConstant(Src0))
Swap = true;
else
- return;
+ return false;
switch (Opcode) {
default:
@@ -509,10 +518,10 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
}
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
- return;
+ return false;
if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI))
- return;
+ return false;
if (Swap) {
// Swap Src0 and Src1 by building a new instruction.
@@ -527,14 +536,17 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
TII->removeModOperands(MI);
MI.setDesc(TII->get(NewOpcode));
}
+ return true;
}
/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
-/// \returns true if the caller should continue the machine function iterator
-bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
+/// \returns ChangeKind::None if no change was made,
+/// ChangeKind::UpdateHint if only regalloc hints were updated, or
+/// ChangeKind::UpdateInst if the instruction itself was modified.
+ChangeKind SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
const MachineOperand *Dest = &MI.getOperand(0);
MachineOperand *Src0 = &MI.getOperand(1);
@@ -544,13 +556,14 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
if (!SrcImm->isImm() ||
AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
- return false;
+ return ChangeKind::None;
uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
uint32_t NewImm = 0;
if (Opc == AMDGPU::S_AND_B32) {
- if (isPowerOf2_32(~Imm)) {
+ if (isPowerOf2_32(~Imm) &&
+ MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)->isDead()) {
NewImm = llvm::countr_one(Imm);
Opc = AMDGPU::S_BITSET0_B32;
} else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
@@ -558,7 +571,8 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
Opc = AMDGPU::S_ANDN2_B32;
}
} else if (Opc == AMDGPU::S_OR_B32) {
- if (isPowerOf2_32(Imm)) {
+ if (isPowerOf2_32(Imm) &&
+ MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)->isDead()) {
NewImm = llvm::countr_zero(Imm);
Opc = AMDGPU::S_BITSET1_B32;
} else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
@@ -578,13 +592,13 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
- return true;
+ return ChangeKind::UpdateHint;
}
if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
const bool IsUndef = SrcReg->isUndef();
const bool IsKill = SrcReg->isKill();
- MI.setDesc(TII->get(Opc));
+ TII->mutateAndCleanupImplicit(MI, TII->get(Opc));
if (Opc == AMDGPU::S_BITSET0_B32 ||
Opc == AMDGPU::S_BITSET1_B32) {
Src0->ChangeToImmediate(NewImm);
@@ -596,10 +610,11 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
} else {
SrcImm->setImm(NewImm);
}
+ return ChangeKind::UpdateInst;
}
}
- return false;
+ return ChangeKind::None;
}
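The new SCC-dead checks in shrinkScalarLogicOp guard the S_BITSET forms: s_and_b32 and s_or_b32 define SCC, but s_bitset0/s_bitset1 do not, so the rewrite is only sound when the SCC result is dead. A worked example of the bit-clear case (values chosen to satisfy the power-of-two test):

  //   s_and_b32 s0, s0, 0xffffffef     ; ~imm == 0x10, a power of two
  // shrinks (only if SCC is dead) to
  //   s_bitset0_b32 s0, 4              ; countr_one(0xffffffef) == 4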
// This is the same as MachineInstr::readsRegister/modifiesRegister except
@@ -791,10 +806,10 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
Y1 = getSubRegForIndex(Y, Ysub, I);
auto *MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
TII->get(AMDGPU::V_SWAP_B32))
- .addDef(X1.Reg, 0, X1.SubReg)
- .addDef(Y1.Reg, 0, Y1.SubReg)
- .addReg(Y1.Reg, 0, Y1.SubReg)
- .addReg(X1.Reg, 0, X1.SubReg)
+ .addDef(X1.Reg, {}, X1.SubReg)
+ .addDef(Y1.Reg, {}, Y1.SubReg)
+ .addReg(Y1.Reg, {}, Y1.SubReg)
+ .addReg(X1.Reg, {}, X1.SubReg)
.getInstr();
Swaps.push_back(MIB);
}
@@ -854,6 +869,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
IsPostRA = MF.getProperties().hasNoVRegs();
unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
+ bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::iterator I, Next;
@@ -877,6 +893,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
if (ModOpcode != 0) {
MI.setDesc(TII->get(ModOpcode));
Src.setImm(static_cast<int64_t>(ModImm));
+ Changed = true;
continue;
}
}
@@ -887,20 +904,35 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
MI.getOpcode() == AMDGPU::COPY)) {
if (auto *NextMI = matchSwap(MI)) {
Next = NextMI->getIterator();
+ Changed = true;
continue;
}
}
+ // Shrink scalar logic operations.
+ if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
+ MI.getOpcode() == AMDGPU::S_OR_B32 ||
+ MI.getOpcode() == AMDGPU::S_XOR_B32) {
+ ChangeKind CK = shrinkScalarLogicOp(MI);
+ if (CK == ChangeKind::UpdateHint)
+ continue;
+ Changed |= (CK == ChangeKind::UpdateInst);
+ }
+
// Try to use S_ADDK_I32 and S_MULK_I32.
if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
- MI.getOpcode() == AMDGPU::S_MUL_I32) {
+ MI.getOpcode() == AMDGPU::S_MUL_I32 ||
+ (MI.getOpcode() == AMDGPU::S_OR_B32 &&
+ MI.getFlag(MachineInstr::MIFlag::Disjoint))) {
const MachineOperand *Dest = &MI.getOperand(0);
MachineOperand *Src0 = &MI.getOperand(1);
MachineOperand *Src1 = &MI.getOperand(2);
if (!Src0->isReg() && Src1->isReg()) {
- if (TII->commuteInstruction(MI, false, 1, 2))
+ if (TII->commuteInstruction(MI, false, 1, 2)) {
std::swap(Src0, Src1);
+ Changed = true;
+ }
}
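The S_ADDK_I32 path now also accepts s_or_b32 carrying the disjoint flag: disjoint guarantees the operands share no set bits, so x | y == x + y and the OR can reuse the addk encoding whenever the literal fits in a signed 16-bit immediate. For example (a sketch):

  //   s_or_b32 s0, s0, 0x7000    ; disjoint: (s0 & 0x7000) known zero
  // can shrink to
  //   s_addk_i32 s0, 0x7000      ; x | y == x + y when x & y == 0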
// FIXME: This could work better if hints worked with subregisters. If
@@ -911,22 +943,22 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
continue;
}
-
if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
if (Src1->isImm() && isKImmOperand(*Src1)) {
- unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
- AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
-
+ unsigned Opc = (MI.getOpcode() == AMDGPU::S_MUL_I32)
+ ? AMDGPU::S_MULK_I32
+ : AMDGPU::S_ADDK_I32;
Src1->setImm(SignExtend64(Src1->getImm(), 32));
MI.setDesc(TII->get(Opc));
MI.tieOperands(0, 1);
+ Changed = true;
}
}
}
// Try to use s_cmpk_*
if (MI.isCompare() && TII->isSOPC(MI)) {
- shrinkScalarCompare(MI);
+ Changed |= shrinkScalarCompare(MI);
continue;
}
@@ -941,27 +973,21 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
if (isKImmOperand(Src)) {
MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
Src.setImm(SignExtend64(Src.getImm(), 32));
+ Changed = true;
} else if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModImm,
/*Scalar=*/true))) {
MI.setDesc(TII->get(ModOpc));
Src.setImm(static_cast<int64_t>(ModImm));
+ Changed = true;
}
}
continue;
}
- // Shrink scalar logic operations.
- if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
- MI.getOpcode() == AMDGPU::S_OR_B32 ||
- MI.getOpcode() == AMDGPU::S_XOR_B32) {
- if (shrinkScalarLogicOp(MI))
- continue;
- }
-
if (IsPostRA && TII->isMIMG(MI.getOpcode()) &&
ST->getGeneration() >= AMDGPUSubtarget::GFX10) {
- shrinkMIMG(MI);
+ Changed |= shrinkMIMG(MI);
continue;
}
@@ -977,14 +1003,14 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64 ||
(MI.getOpcode() == AMDGPU::V_FMA_F64_e64 &&
ST->hasFmaakFmamkF64Insts())) {
- shrinkMadFma(MI);
+ Changed |= shrinkMadFma(MI);
continue;
}
    // If there is no chance we will shrink it and use VCC as sdst to get
    // a 32-bit form, try to replace a dead sdst with NULL.
if (TII->isVOP3(MI.getOpcode())) {
- tryReplaceDeadSDST(MI);
+ Changed |= tryReplaceDeadSDST(MI);
if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
continue;
}
@@ -995,9 +1021,12 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
// it.
if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
!TII->canShrink(MI, *MRI)) {
- tryReplaceDeadSDST(MI);
+ Changed |= tryReplaceDeadSDST(MI);
continue;
}
+
+ // Operands were commuted.
+ Changed = true;
}
int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
@@ -1101,9 +1130,10 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
foldImmediates(*Inst32);
LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
+ Changed = true;
}
}
- return false;
+ return Changed;
}
bool SIShrinkInstructionsLegacy::runOnMachineFunction(MachineFunction &MF) {
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 6611e1e..5fd0c1e 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -188,8 +188,9 @@ private:
void markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
- void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
- unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
+ void markDefs(const MachineInstr &UseMI, LiveRange &LR,
+ VirtRegOrUnit VRegOrUnit, unsigned SubReg, char Flag,
+ std::vector<WorkItem> &Worklist);
void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
std::vector<WorkItem> &Worklist);
void markInstructionUses(const MachineInstr &MI, char Flag,
@@ -318,8 +319,8 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
- Register Reg, unsigned SubReg, char Flag,
- std::vector<WorkItem> &Worklist) {
+ VirtRegOrUnit VRegOrUnit, unsigned SubReg,
+ char Flag, std::vector<WorkItem> &Worklist) {
LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
@@ -331,8 +332,9 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
// cover registers.
const LaneBitmask UseLanes =
SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
- : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
- : LaneBitmask::getNone());
+ : (VRegOrUnit.isVirtualReg()
+ ? MRI->getMaxLaneMaskForVReg(VRegOrUnit.asVirtualReg())
+ : LaneBitmask::getNone());
// Perform a depth-first iteration of the LiveRange graph marking defs.
// Stop processing of a given branch when all use lanes have been defined.
@@ -382,11 +384,11 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
assert(MI && "Def has no defining instruction");
- if (Reg.isVirtual()) {
+ if (VRegOrUnit.isVirtualReg()) {
// Iterate over all operands to find relevant definitions
bool HasDef = false;
for (const MachineOperand &Op : MI->all_defs()) {
- if (Op.getReg() != Reg)
+ if (Op.getReg() != VRegOrUnit.asVirtualReg())
continue;
// Compute lanes defined and overlap with use
@@ -453,7 +455,7 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI,
<< " for " << MI);
if (Reg.isVirtual()) {
LiveRange &LR = LIS->getInterval(Reg);
- markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
+ markDefs(MI, LR, VirtRegOrUnit(Reg), Op.getSubReg(), Flag, Worklist);
} else {
// Handle physical registers that we need to track; this is mostly relevant
// for VCC, which can appear as the (implicit) input of a uniform branch,
@@ -462,7 +464,8 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI,
LiveRange &LR = LIS->getRegUnit(Unit);
const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
if (Value)
- markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
+ markDefs(MI, LR, VirtRegOrUnit(Unit), AMDGPU::NoSubRegister, Flag,
+ Worklist);
}
}
}
@@ -1101,10 +1104,15 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
LiveRange &LR =
LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
auto MBBE = MBB.end();
- SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
- : LIS->getMBBEndIdx(&MBB);
- SlotIndex LastIdx =
- Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
+ // Skip debug instructions when getting slot indices, as they don't have
+ // entries in the slot index map.
+ auto FirstNonDbg = skipDebugInstructionsForward(First, MBBE);
+ auto LastNonDbg = skipDebugInstructionsForward(Last, MBBE);
+ SlotIndex FirstIdx = FirstNonDbg != MBBE
+ ? LIS->getInstructionIndex(*FirstNonDbg)
+ : LIS->getMBBEndIdx(&MBB);
+ SlotIndex LastIdx = LastNonDbg != MBBE ? LIS->getInstructionIndex(*LastNonDbg)
+ : LIS->getMBBEndIdx(&MBB);
SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
const LiveRange::Segment *S;
@@ -1121,8 +1129,8 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
} else {
MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
assert(EndMI && "Segment does not end on valid instruction");
- auto NextI = std::next(EndMI->getIterator());
- if (NextI == MBB.end())
+ auto NextI = next_nodbg(EndMI->getIterator(), MBB.instr_end());
+ if (NextI == MBB.instr_end())
break;
SlotIndex Next = LIS->getInstructionIndex(*NextI);
if (Next > LastIdx)
@@ -1176,16 +1184,17 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
}
}
+ const DebugLoc &DL = MBB.findDebugLoc(Before);
MachineInstr *MI;
if (SaveWQM) {
unsigned Opcode =
IsTerminator ? LMC.AndSaveExecTermOpc : LMC.AndSaveExecOpc;
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
- .addReg(LiveMaskReg);
+ MI =
+ BuildMI(MBB, Before, DL, TII->get(Opcode), SaveWQM).addReg(LiveMaskReg);
} else {
unsigned Opcode = IsTerminator ? LMC.AndTermOpc : LMC.AndOpc;
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), LMC.ExecReg)
+ MI = BuildMI(MBB, Before, DL, TII->get(Opcode), LMC.ExecReg)
.addReg(LMC.ExecReg)
.addReg(LiveMaskReg);
}
@@ -1197,13 +1206,14 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
Register SavedWQM) {
+ const DebugLoc &DL = MBB.findDebugLoc(Before);
MachineInstr *MI;
if (SavedWQM) {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), LMC.ExecReg)
+ MI = BuildMI(MBB, Before, DL, TII->get(AMDGPU::COPY), LMC.ExecReg)
.addReg(SavedWQM);
} else {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(LMC.WQMOpc), LMC.ExecReg)
+ MI = BuildMI(MBB, Before, DL, TII->get(LMC.WQMOpc), LMC.ExecReg)
.addReg(LMC.ExecReg);
}
@@ -1219,13 +1229,13 @@ void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
assert(StrictStateNeeded == StateStrictWWM ||
StrictStateNeeded == StateStrictWQM);
+ const DebugLoc &DL = MBB.findDebugLoc(Before);
+
if (StrictStateNeeded == StateStrictWWM) {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
- SaveOrig)
+ MI = BuildMI(MBB, Before, DL, TII->get(AMDGPU::ENTER_STRICT_WWM), SaveOrig)
.addImm(-1);
} else {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
- SaveOrig)
+ MI = BuildMI(MBB, Before, DL, TII->get(AMDGPU::ENTER_STRICT_WQM), SaveOrig)
.addImm(-1);
}
LIS->InsertMachineInstrInMaps(*MI);
@@ -1242,14 +1252,16 @@ void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
assert(CurrentStrictState == StateStrictWWM ||
CurrentStrictState == StateStrictWQM);
+ const DebugLoc &DL = MBB.findDebugLoc(Before);
+
if (CurrentStrictState == StateStrictWWM) {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
- LMC.ExecReg)
- .addReg(SavedOrig);
+ MI =
+ BuildMI(MBB, Before, DL, TII->get(AMDGPU::EXIT_STRICT_WWM), LMC.ExecReg)
+ .addReg(SavedOrig);
} else {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
- LMC.ExecReg)
- .addReg(SavedOrig);
+ MI =
+ BuildMI(MBB, Before, DL, TII->get(AMDGPU::EXIT_STRICT_WQM), LMC.ExecReg)
+ .addReg(SavedOrig);
}
LIS->InsertMachineInstrInMaps(*MI);
StateTransition[MI] = NonStrictState;
@@ -1629,7 +1641,7 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
}
// Insert instruction sequence at block beginning (before vector operations).
- const DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
const unsigned WavefrontSize = ST->getWavefrontSize();
const unsigned Mask = (WavefrontSize << 1) - 1;
Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 781c61b0..ee8d29c 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -1464,7 +1464,7 @@ class SMEM_Real_gfx12Plus<bits<6> op, SM_Pseudo ps, string opName,
class SMEM_Real_gfx12<bits<6> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
SMEM_Real_gfx12Plus<op, ps, opName, SIEncodingFamily.GFX12,
SGPR_NULL_gfx11plus> {
- let AssemblerPredicate = isGFX12Plus;
+ let AssemblerPredicate = isGFX12Only;
let DecoderNamespace = "GFX12";
let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
@@ -1537,3 +1537,84 @@ multiclass SMEM_Real_Probe_gfx12<bits<6> op> {
defm S_ATC_PROBE : SMEM_Real_Probe_gfx12<0x22>;
defm S_ATC_PROBE_BUFFER : SMEM_Real_Probe_gfx12<0x23>;
+
+//===----------------------------------------------------------------------===//
+// GFX13.
+//===----------------------------------------------------------------------===//
+
+class SMEM_Real_gfx13<bits<6> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
+ SMEM_Real_gfx12Plus<op, ps, opName, SIEncodingFamily.GFX13,
+ SGPR_NULL_gfx11plus> {
+ let AssemblerPredicate = isGFX13Plus;
+ let DecoderNamespace = "GFX13";
+
+ let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
+ let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
+}
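+// Note on the packing above: sbase names an aligned SGPR pair, so its low
+// encoding bit is implicitly zero and only sbase{6-1} is stored. E.g. for
+// sbase = s[4:5] (register encoding 4 = 0b0000100), Inst{5-0} holds
+// 0b000010, i.e. the pair index 2.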
+
+class SMEM_Real_Prefetch_gfx13<bits<6> op, SM_Pseudo ps> :
+ SMEM_Real_gfx13<op, ps> {
+ bits<7> sdata; // Only 5 bits of sdata are supported.
+
+ let sdst = ?;
+ let Inst{12-11} = 0; // Unused sdata bits.
+ let Inst{10-6} = !if(ps.has_sdst, sdata{4-0}, ?);
+}
+
+class SMEM_Real_Load_gfx13<bits<6> op, string ps, string opName, OffsetMode offsets> :
+ SMEM_Real_gfx13<op, !cast<SM_Pseudo>(ps # offsets.Variant), opName> {
+ RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
+ let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));
+
+ let Inst{20} = cpol{CPolBit.NV}; // non-volatile
+ let Inst{22-21} = cpol{4-3}; // scope
+ let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported
+ let Inst{56} = cpol{CPolBit.SCAL}; // scale offset
+}
+
+multiclass SM_Real_Loads_gfx13<bits<6> op, string ps = NAME> {
+ defvar opName = !tolower(NAME);
+ def _IMM_gfx13 : SMEM_Real_Load_gfx13<op, ps, opName, IMM_Offset>;
+ def _SGPR_IMM_gfx13 : SMEM_Real_Load_gfx13<op, ps, opName, SGPR_IMM_OptOffset>;
+}
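+// Each load therefore gets two GFX13 encodings, one per offset mode.
+// Illustrative assembly (GFX12 syntax, assumed unchanged here):
+//   s_load_b32 s0, s[2:3], 0x10             ; _IMM_gfx13
+//   s_load_b32 s0, s[2:3], s5 offset:0x10   ; _SGPR_IMM_gfx13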
+
+defm S_LOAD_B32 : SM_Real_Loads_gfx13<0x00, "S_LOAD_DWORD">;
+defm S_LOAD_B64 : SM_Real_Loads_gfx13<0x01, "S_LOAD_DWORDX2">;
+defm S_LOAD_B96 : SM_Real_Loads_gfx13<0x0e, "S_LOAD_DWORDX3">;
+defm S_LOAD_B128 : SM_Real_Loads_gfx13<0x02, "S_LOAD_DWORDX4">;
+defm S_LOAD_B256 : SM_Real_Loads_gfx13<0x03, "S_LOAD_DWORDX8">;
+defm S_LOAD_B512 : SM_Real_Loads_gfx13<0x04, "S_LOAD_DWORDX16">;
+
+defm S_LOAD_I8 : SM_Real_Loads_gfx13<0x30>;
+defm S_LOAD_U8 : SM_Real_Loads_gfx13<0x31>;
+defm S_LOAD_I16 : SM_Real_Loads_gfx13<0x32>;
+defm S_LOAD_U16 : SM_Real_Loads_gfx13<0x33>;
+
+defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx13<0x08, "S_BUFFER_LOAD_DWORD">;
+defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx13<0x09, "S_BUFFER_LOAD_DWORDX2">;
+defm S_BUFFER_LOAD_B96 : SM_Real_Loads_gfx13<0x0d, "S_BUFFER_LOAD_DWORDX3">;
+defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx13<0x0a, "S_BUFFER_LOAD_DWORDX4">;
+defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx13<0x0b, "S_BUFFER_LOAD_DWORDX8">;
+defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx13<0x0c, "S_BUFFER_LOAD_DWORDX16">;
+
+defm S_BUFFER_LOAD_I8 : SM_Real_Loads_gfx13<0x34>;
+defm S_BUFFER_LOAD_U8 : SM_Real_Loads_gfx13<0x35>;
+defm S_BUFFER_LOAD_I16 : SM_Real_Loads_gfx13<0x36>;
+defm S_BUFFER_LOAD_U16 : SM_Real_Loads_gfx13<0x37>;
+
+def S_DCACHE_INV_gfx13 : SMEM_Real_gfx13<0x020, S_DCACHE_INV>;
+
+def S_PREFETCH_INST_gfx13 : SMEM_Real_Prefetch_gfx13<0x22, S_PREFETCH_INST>;
+def S_PREFETCH_INST_PC_REL_gfx13 : SMEM_Real_Prefetch_gfx13<0x23, S_PREFETCH_INST_PC_REL>;
+def S_PREFETCH_DATA_gfx13 : SMEM_Real_Prefetch_gfx13<0x2c, S_PREFETCH_DATA>;
+def S_BUFFER_PREFETCH_DATA_gfx13 : SMEM_Real_Prefetch_gfx13<0x2d, S_BUFFER_PREFETCH_DATA>;
+def S_PREFETCH_DATA_PC_REL_gfx13 : SMEM_Real_Prefetch_gfx13<0x2e, S_PREFETCH_DATA_PC_REL>;
+
+multiclass SMEM_Real_Probe_gfx13<bits<6> op> {
+ defvar ps = NAME;
+ def _IMM_gfx13 : SMEM_Real_Prefetch_gfx13<op, !cast<SM_Probe_Pseudo>(ps#_IMM)>;
+ def _SGPR_IMM_gfx13 : SMEM_Real_Prefetch_gfx13<op, !cast<SM_Probe_Pseudo>(ps#_SGPR_OPT_IMM)>;
+}
+
+defm S_ATC_PROBE : SMEM_Real_Probe_gfx13<0x26>;
+defm S_ATC_PROBE_BUFFER : SMEM_Real_Probe_gfx13<0x27>;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 84287b6..ce6e862 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -433,8 +433,10 @@ let SubtargetPredicate = isGFX11Plus in {
} // End SubtargetPredicate = isGFX11Plus
let SubtargetPredicate = isGFX12Plus in {
- let hasSideEffects = 1, Defs = [SCC] in {
- def S_ALLOC_VGPR : SOP1_0_32 <"s_alloc_vgpr">;
+ let hasSideEffects = 1, isConvergent = 1, Defs = [SCC] in {
+ def S_ALLOC_VGPR : SOP1_0_32 <"s_alloc_vgpr",
+ [(set SCC, (int_amdgcn_s_alloc_vgpr SSrc_b32:$src0))]
+ >;
}
} // End SubtargetPredicate = isGFX12Plus
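// With the new pattern, the intrinsic's i1 result comes back in SCC; e.g.
// (sketch) IR such as
//   %ok = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 %n)
// can now be selected directly to s_alloc_vgpr.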
@@ -469,6 +471,25 @@ let SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE],
} // End SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE]
// SchedRW = [WriteSFPU], isReMaterializable = 1
+let SubtargetPredicate = HasSALUFloatInsts, AddedComplexity = 9 in {
+ // Fallback patterns for f32->i16 conversion.
+ def : GCNPat<(i16 (UniformUnaryFrag<fp_to_sint> f32:$src0)),
+ (S_CVT_I32_F32 $src0)>;
+ def : GCNPat<(i16 (UniformUnaryFrag<fp_to_uint> f32:$src0)),
+ (S_CVT_U32_F32 $src0)>;
+  // f16 -> i32: convert via an f32 chain (f16 -> f32 -> i32).
+ def : GCNPat<(i32 (UniformUnaryFrag<fp_to_sint> f16:$src0)),
+ (S_CVT_I32_F32 (S_CVT_F32_F16 $src0))>;
+ def : GCNPat<(i32 (UniformUnaryFrag<fp_to_uint> f16:$src0)),
+ (S_CVT_U32_F32 (S_CVT_F32_F16 $src0))>;
+
+  // i32 -> f16: convert via an f32 chain (i32 -> f32 -> f16).
+ def : GCNPat<(f16 (UniformUnaryFrag<sint_to_fp> i32:$src0)),
+ (S_CVT_F16_F32 (S_CVT_F32_I32 $src0))>;
+ def : GCNPat<(f16 (UniformUnaryFrag<uint_to_fp> i32:$src0)),
+ (S_CVT_F16_F32 (S_CVT_F32_U32 $src0))>;
+}
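+// Each chained pattern above expands to two scalar instructions, e.g. for
+// `%r = fptosi half %x to i32` the expected selection (sketch) is:
+//   s_cvt_f32_f16 s1, s0
+//   s_cvt_i32_f32 s1, s1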
+
let hasSideEffects = 1 in {
let has_sdst = 0 in {
let Uses = [M0] in {
@@ -504,6 +525,12 @@ def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins),
let isConvergent = 1;
}
+def S_WAKEUP_BARRIER_M0 : SOP1_Pseudo <"s_wakeup_barrier m0", (outs), (ins),
+ "", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+ let SubtargetPredicate = HasSWakeupBarrier;
+}
} // End Uses = [M0]
def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs),
@@ -527,6 +554,12 @@ def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs),
let isConvergent = 1;
}
+def S_WAKEUP_BARRIER_IMM : SOP1_Pseudo <"s_wakeup_barrier", (outs),
+ (ins SplitBarrier:$src0), "$src0", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+ let SubtargetPredicate = HasSWakeupBarrier;
+}
} // End has_sdst = 0
def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst),
@@ -838,9 +871,10 @@ def S_CBRANCH_G_FORK : SOP2_Pseudo <
let SubtargetPredicate = isGFX6GFX7GFX8GFX9;
}
-let Defs = [SCC] in {
-def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">;
-} // End Defs = [SCC]
+let isCommutable = 1, Defs = [SCC] in
+def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32",
+ [(set i32:$sdst, (UniformUnaryFrag<abs> (sub_oneuse i32:$src0, i32:$src1)))]
+>;
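+// The sub_oneuse guard means the fold fires only when the subtraction has
+// no other users, e.g. (sketch):
+//   %d = sub i32 %a, %b              ; %d otherwise unused
+//   %r = call i32 @llvm.abs.i32(i32 %d, i1 false)
+// collapses to a single s_absdiff_i32 for uniform values.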
let SubtargetPredicate = isGFX8GFX9 in {
def S_RFE_RESTORE_B64 : SOP2_Pseudo <
@@ -1618,23 +1652,34 @@ def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm
let isConvergent = 1;
}
+
+let SchedRW = [WriteBarrier], isConvergent = 1 in {
+ let SubtargetPredicate = isGFX12Only in
def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave",
(ins), "", [(int_amdgcn_s_barrier_leave (i16 srcvalue))] > {
- let SchedRW = [WriteBarrier];
- let simm16 = 0;
- let fixed_imm = 1;
- let isConvergent = 1;
- let Defs = [SCC];
+ let simm16 = 0;
+ let fixed_imm = 1;
+ let Defs = [SCC];
+ }
+
+ let SubtargetPredicate = HasSBarrierLeaveImm in
+ def S_BARRIER_LEAVE_IMM : SOPP_Pseudo <"s_barrier_leave",
+ (ins i16imm:$simm16), "$simm16", [(int_amdgcn_s_barrier_leave timm:$simm16)]>;
}
def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
- let SubtargetPredicate = isGFX8Plus;
+ let SubtargetPredicate = isGFX8GFX9GFX10GFX11GFX12;
let simm16 = 0;
let fixed_imm = 1;
let mayLoad = 1;
let mayStore = 1;
}
+let SubtargetPredicate = HasSWakeupImm in {
+ def S_WAKEUP_imm : SOPP_Pseudo <"s_wakeup",
+ (ins i16imm:$simm16), "$simm16">;
+} // End SubtargetPredicate = HasSWakeupImm
+
let SubtargetPredicate = isNotGFX1250Plus in {
def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins SWaitCnt:$simm16), "$simm16",
[(int_amdgcn_s_waitcnt timm:$simm16)]>;
@@ -1667,11 +1712,21 @@ let SubtargetPredicate = HasWaitXcnt in {
// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.
-
def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> {
let hasSideEffects = 0;
}
+let SubtargetPredicate = HasVMemToLDSLoad in {
+def ASYNCMARK : SPseudoInstSI<(outs), (ins),
+ [(int_amdgcn_asyncmark)]> {
+ let maybeAtomic = 0;
+}
+def WAIT_ASYNCMARK : SOPP_Pseudo <"", (ins s16imm:$simm16), "$simm16",
+ [(int_amdgcn_wait_asyncmark timm:$simm16)]> {
+ let maybeAtomic = 0;
+}
+}
+
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16"> {
@@ -1791,8 +1846,8 @@ let SubtargetPredicate = isGFX10Plus in {
let SubtargetPredicate = isGFX11Plus in {
let OtherPredicates = [HasExportInsts] in
- def S_WAIT_EVENT : SOPP_Pseudo<"s_wait_event", (ins s16imm:$simm16),
- "$simm16"> {
+ def S_WAIT_EVENT : SOPP_Pseudo<"s_wait_event", (ins WaitEvent:$simm16),
+ "$simm16", [(int_amdgcn_s_wait_event timm:$simm16)]> {
let hasSideEffects = 1;
}
def S_DELAY_ALU : SOPP_Pseudo<"s_delay_alu", (ins SDelayALU:$simm16),
@@ -1915,9 +1970,7 @@ def : GCNPat<
(S_SEXT_I32_I16 $src)
>;
-let SubtargetPredicate = isNotGFX12Plus in
- def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 0))>;
-let SubtargetPredicate = isGFX12Plus in
+let SubtargetPredicate = isGFX11Plus in
def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 2))>;
// The first 10 bits of the mode register are the core FP mode on all
@@ -2091,7 +2144,34 @@ class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> {
}
//===----------------------------------------------------------------------===//
-// SOP1 - GFX11, GFX12
+// SOP1 - GFX13
+//===----------------------------------------------------------------------===//
+
+multiclass SOP1_Real_gfx13<bits<8> op, string name = !tolower(NAME)> {
+ defvar ps = !cast<SOP1_Pseudo>(NAME);
+ def _gfx13 : SOP1_Real<op, ps, name>,
+ Select<GFX13Gen, ps.Mnemonic>;
+ if !ne(ps.Mnemonic, name) then
+ def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX13Only]>;
+}
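+// For renamed instructions the alias keeps the legacy spelling assembling
+// on GFX13: e.g. the S_FF1_I32_B32 defm below passes "s_ctz_i32_b32", so
+// the instruction prints under the new name while s_ff1_i32_b32 is still
+// accepted via the MnemonicAlias.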
+
+multiclass SOP1_M0_Real_gfx13<bits<8> op> {
+ defvar ps = !cast<SOP1_Pseudo>(NAME);
+ def _gfx13 : SOP1_Real<op, ps>, Select<GFX13Gen, ps.PseudoInstr> {
+ let Inst{7-0} = M0_gfx11plus.HWEncoding{7-0}; // Set Src0 encoding to M0
+ }
+}
+
+multiclass SOP1_IMM_Real_gfx13<bits<8> op> {
+ defvar ps = !cast<SOP1_Pseudo>(NAME);
+ def _gfx13 : SOP1_Real<op, ps>,
+ Select<GFX13Gen, ps.PseudoInstr>;
+}
+
+defm S_GET_SHADER_CYCLES_U64 : SOP1_Real_gfx13<0x011>;
+
+//===----------------------------------------------------------------------===//
+// SOP1 - GFX11, GFX12, GFX13
//===----------------------------------------------------------------------===//
multiclass SOP1_Real_gfx11<bits<8> op, string name = !tolower(NAME)> {
@@ -2110,23 +2190,29 @@ multiclass SOP1_Real_gfx12<bits<8> op, string name = !tolower(NAME)> {
Select<GFX12Gen, ps.PseudoInstr>;
if !ne(ps.Mnemonic, name) then
def : AMDGPUMnemonicAlias<ps.Mnemonic, name> {
- let AssemblerPredicate = isGFX12Plus;
+ let AssemblerPredicate = isGFX12Only;
}
}
multiclass SOP1_M0_Real_gfx12<bits<8> op> {
- def _gfx12 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
- Select<GFX12Gen, !cast<SOP1_Pseudo>(NAME).PseudoInstr> {
+ defvar ps = !cast<SOP1_Pseudo>(NAME);
+ def _gfx12 : SOP1_Real<op, ps>, Select<GFX12Gen, ps.PseudoInstr> {
let Inst{7-0} = M0_gfx11plus.HWEncoding{7-0}; // Set Src0 encoding to M0
}
}
+multiclass SOP1_M0_Real_gfx12_gfx13<bits<8> op> :
+ SOP1_M0_Real_gfx12<op>, SOP1_M0_Real_gfx13<op>;
+
multiclass SOP1_IMM_Real_gfx12<bits<8> op> {
defvar ps = !cast<SOP1_Pseudo>(NAME);
def _gfx12 : SOP1_Real<op, ps>,
Select<GFX12Gen, ps.PseudoInstr>;
}
+multiclass SOP1_IMM_Real_gfx12_gfx13<bits<8> op> :
+ SOP1_IMM_Real_gfx12<op>, SOP1_IMM_Real_gfx13<op>;
+
multiclass SOP1_Real_gfx11_gfx12<bits<8> op, string name = !tolower(NAME)> :
SOP1_Real_gfx11<op, name>, SOP1_Real_gfx12<op, name>;
@@ -2139,6 +2225,12 @@ multiclass SOP1_Real_gfx1250<bits<8> op, string name = !tolower(NAME)> {
def : AMDGPUMnemonicAlias<ps.Mnemonic, name>;
}
+multiclass SOP1_Real_gfx11_gfx12_gfx13<bits<8> op> :
+ SOP1_Real_gfx11<op>, SOP1_Real_gfx12<op>, SOP1_Real_gfx13<op>;
+
+multiclass SOP1_Real_gfx12_gfx13<bits<8> op> :
+ SOP1_Real_gfx12<op>, SOP1_Real_gfx13<op>;
+
defm S_MOV_B32 : SOP1_Real_gfx11_gfx12<0x000>;
defm S_MOV_B64 : SOP1_Real_gfx11_gfx12<0x001>;
defm S_CMOV_B32 : SOP1_Real_gfx11_gfx12<0x002>;
@@ -2207,47 +2299,49 @@ defm S_GETPC_B64 : SOP1_Real_gfx1250<0x047, "s_get_pc_i64">;
defm S_SETPC_B64 : SOP1_Real_gfx1250<0x048, "s_set_pc_i64">;
defm S_SWAPPC_B64 : SOP1_Real_gfx1250<0x049, "s_swap_pc_i64">;
defm S_RFE_B64 : SOP1_Real_gfx1250<0x04a, "s_rfe_i64">;
-defm S_SENDMSG_RTN_B32 : SOP1_Real_gfx11_gfx12<0x04c>;
-defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11_gfx12<0x04d>;
-defm S_BARRIER_SIGNAL_M0 : SOP1_M0_Real_gfx12<0x04e>;
-defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12<0x04f>;
-defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12<0x050>;
-defm S_BARRIER_INIT_M0 : SOP1_M0_Real_gfx12<0x051>;
-defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12<0x052>;
-defm S_BARRIER_SIGNAL_IMM : SOP1_IMM_Real_gfx12<0x04e>;
-defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_IMM_Real_gfx12<0x04f>;
-defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12<0x050>;
-defm S_BARRIER_INIT_IMM : SOP1_IMM_Real_gfx12<0x051>;
-defm S_BARRIER_JOIN_IMM : SOP1_IMM_Real_gfx12<0x052>;
-defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>;
-defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>;
-
-// GFX1250
+defm S_SENDMSG_RTN_B32 : SOP1_Real_gfx11_gfx12_gfx13<0x04c>;
+defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11_gfx12_gfx13<0x04d>;
+defm S_BARRIER_SIGNAL_M0 : SOP1_M0_Real_gfx12_gfx13<0x04e>;
+defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12_gfx13<0x04f>;
+defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12_gfx13<0x050>;
+defm S_BARRIER_INIT_M0 : SOP1_M0_Real_gfx12_gfx13<0x051>;
+defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12_gfx13<0x052>;
+defm S_BARRIER_SIGNAL_IMM : SOP1_IMM_Real_gfx12_gfx13<0x04e>;
+defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_IMM_Real_gfx12_gfx13<0x04f>;
+defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12_gfx13<0x050>;
+defm S_BARRIER_INIT_IMM : SOP1_IMM_Real_gfx12_gfx13<0x051>;
+defm S_BARRIER_JOIN_IMM : SOP1_IMM_Real_gfx12_gfx13<0x052>;
+defm S_ALLOC_VGPR : SOP1_Real_gfx12_gfx13<0x053>;
+defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12_gfx13<0x058>;
+
+// GFX1250, GFX13
defm S_GET_SHADER_CYCLES_U64 : SOP1_Real_gfx12<0x06>;
-defm S_ADD_PC_I64 : SOP1_Real_gfx12<0x04b>;
+defm S_ADD_PC_I64 : SOP1_Real_gfx12_gfx13<0x04b>;
+defm S_WAKEUP_BARRIER_M0 : SOP1_M0_Real_gfx12_gfx13<0x057>;
+defm S_WAKEUP_BARRIER_IMM : SOP1_IMM_Real_gfx12_gfx13<0x057>;
//===----------------------------------------------------------------------===//
-// SOP1 - GFX1150, GFX12
+// SOP1 - GFX1150, GFX12, GFX13
//===----------------------------------------------------------------------===//
-defm S_CEIL_F32 : SOP1_Real_gfx11_gfx12<0x060>;
-defm S_FLOOR_F32 : SOP1_Real_gfx11_gfx12<0x061>;
-defm S_TRUNC_F32 : SOP1_Real_gfx11_gfx12<0x062>;
-defm S_RNDNE_F32 : SOP1_Real_gfx11_gfx12<0x063>;
-defm S_CVT_F32_I32 : SOP1_Real_gfx11_gfx12<0x064>;
-defm S_CVT_F32_U32 : SOP1_Real_gfx11_gfx12<0x065>;
-defm S_CVT_I32_F32 : SOP1_Real_gfx11_gfx12<0x066>;
-defm S_CVT_U32_F32 : SOP1_Real_gfx11_gfx12<0x067>;
-defm S_CVT_F16_F32 : SOP1_Real_gfx11_gfx12<0x068>;
-defm S_CVT_F32_F16 : SOP1_Real_gfx11_gfx12<0x069>;
-defm S_CVT_HI_F32_F16 : SOP1_Real_gfx11_gfx12<0x06a>;
-defm S_CEIL_F16 : SOP1_Real_gfx11_gfx12<0x06b>;
-defm S_FLOOR_F16 : SOP1_Real_gfx11_gfx12<0x06c>;
-defm S_TRUNC_F16 : SOP1_Real_gfx11_gfx12<0x06d>;
-defm S_RNDNE_F16 : SOP1_Real_gfx11_gfx12<0x06e>;
+defm S_CEIL_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x060>;
+defm S_FLOOR_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x061>;
+defm S_TRUNC_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x062>;
+defm S_RNDNE_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x063>;
+defm S_CVT_F32_I32 : SOP1_Real_gfx11_gfx12_gfx13<0x064>;
+defm S_CVT_F32_U32 : SOP1_Real_gfx11_gfx12_gfx13<0x065>;
+defm S_CVT_I32_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x066>;
+defm S_CVT_U32_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x067>;
+defm S_CVT_F16_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x068>;
+defm S_CVT_F32_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x069>;
+defm S_CVT_HI_F32_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x06a>;
+defm S_CEIL_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x06b>;
+defm S_FLOOR_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x06c>;
+defm S_TRUNC_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x06d>;
+defm S_RNDNE_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x06e>;
//===----------------------------------------------------------------------===//
-// SOP1 - GFX10.
+// SOP1 - GFX10, GFX13
//===----------------------------------------------------------------------===//
multiclass SOP1_Real_gfx10<bits<8> op> {
@@ -2256,30 +2350,33 @@ multiclass SOP1_Real_gfx10<bits<8> op> {
Select<GFX10Gen, ps.PseudoInstr>;
}
-multiclass SOP1_Real_gfx10_gfx11_gfx12<bits<8> op> :
- SOP1_Real_gfx10<op>, SOP1_Real_gfx11_gfx12<op>;
+multiclass SOP1_Real_gfx10_gfx13<bits<8> op> :
+ SOP1_Real_gfx10<op>, SOP1_Real_gfx13<op>;
-defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x037>;
-defm S_ORN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x038>;
-defm S_ANDN1_WREXEC_B64 : SOP1_Real_gfx10<0x039>;
-defm S_ANDN2_WREXEC_B64 : SOP1_Real_gfx10<0x03a>;
-defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx10<0x03b>;
-defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03c>;
-defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03d>;
-defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03e>;
-defm S_ANDN2_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03f>;
-defm S_ORN2_SAVEEXEC_B32 : SOP1_Real_gfx10<0x040>;
-defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx10<0x041>;
-defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x042>;
-defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x043>;
-defm S_ANDN1_SAVEEXEC_B32 : SOP1_Real_gfx10<0x044>;
-defm S_ORN1_SAVEEXEC_B32 : SOP1_Real_gfx10<0x045>;
-defm S_ANDN1_WREXEC_B32 : SOP1_Real_gfx10<0x046>;
-defm S_ANDN2_WREXEC_B32 : SOP1_Real_gfx10<0x047>;
-defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>;
+multiclass SOP1_Real_gfx10_Renamed_gfx13<bits<8> op, string gfx13_name> :
+ SOP1_Real_gfx10<op>, SOP1_Real_gfx13<op, gfx13_name>;
+
+defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10_Renamed_gfx13<0x37, "s_and_not0_saveexec_b64">;
+defm S_ORN1_SAVEEXEC_B64 : SOP1_Real_gfx10_Renamed_gfx13<0x38, "s_or_not0_saveexec_b64">;
+defm S_ANDN1_WREXEC_B64 : SOP1_Real_gfx10_Renamed_gfx13<0x39, "s_and_not0_wrexec_b64">;
+defm S_ANDN2_WREXEC_B64 : SOP1_Real_gfx10_Renamed_gfx13<0x3a, "s_and_not1_wrexec_b64">;
+defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx10_gfx13<0x03b>;
+defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x03c>;
+defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x03d>;
+defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x03e>;
+defm S_ANDN2_SAVEEXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x3f, "s_and_not1_saveexec_b32">;
+defm S_ORN2_SAVEEXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x40, "s_or_not1_saveexec_b32">;
+defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x041>;
+defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x042>;
+defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x043>;
+defm S_ANDN1_SAVEEXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x44, "s_and_not0_saveexec_b32">;
+defm S_ORN1_SAVEEXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x45, "s_or_not0_saveexec_b32">;
+defm S_ANDN1_WREXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x46, "s_and_not0_wrexec_b32">;
+defm S_ANDN2_WREXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x47, "s_and_not1_wrexec_b32">;
+defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10_gfx13<0x049>;
//===----------------------------------------------------------------------===//
-// SOP1 - GFX6, GFX7, GFX10, GFX11.
+// SOP1 - GFX6, GFX7, GFX10, GFX11, GFX12, GFX13
//===----------------------------------------------------------------------===//
@@ -2292,61 +2389,82 @@ multiclass SOP1_Real_gfx6_gfx7<bits<8> op> {
multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> :
SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>;
-multiclass SOP1_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op> :
- SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10_gfx11_gfx12<op>;
+multiclass SOP1_Real_gfx6_gfx7_gfx10_gfx13<bits<8> op> :
+ SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>, SOP1_Real_gfx13<op>;
+
+multiclass SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<bits<8> op, string gfx13_name> :
+ SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>, SOP1_Real_gfx13<op, gfx13_name>;
+
+multiclass SOP1_Real_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<bits<8> op> :
+ SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>, SOP1_Real_gfx11<op>,
+ SOP1_Real_gfx12<op>, SOP1_Real_gfx13<op>;
defm S_CBRANCH_JOIN : SOP1_Real_gfx6_gfx7<0x032>;
-defm S_MOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x003>;
-defm S_MOV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x004>;
-defm S_CMOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x005>;
-defm S_CMOV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x006>;
-defm S_NOT_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x007>;
-defm S_NOT_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x008>;
-defm S_WQM_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x009>;
-defm S_WQM_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00a>;
-defm S_BREV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00b>;
-defm S_BREV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00c>;
-defm S_BCNT0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00d>;
-defm S_BCNT0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00e>;
-defm S_BCNT1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00f>;
-defm S_BCNT1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x010>;
+defm S_MOV_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x003>;
+defm S_MOV_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x004>;
+defm S_CMOV_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x005>;
+defm S_CMOV_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x006>;
+defm S_NOT_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x007>;
+defm S_NOT_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x008>;
+defm S_WQM_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x009>;
+defm S_WQM_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00a>;
+defm S_BREV_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00b>;
+defm S_BREV_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00c>;
+defm S_BCNT0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00d>;
+defm S_BCNT0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00e>;
+defm S_BCNT1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00f>;
+defm S_BCNT1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x010>;
defm S_FF0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x011>;
defm S_FF0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x012>;
-defm S_FF1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x013>;
-defm S_FF1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x014>;
-defm S_FLBIT_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x015>;
-defm S_FLBIT_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x016>;
-defm S_FLBIT_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x017>;
-defm S_FLBIT_I32_I64 : SOP1_Real_gfx6_gfx7_gfx10<0x018>;
-defm S_SEXT_I32_I8 : SOP1_Real_gfx6_gfx7_gfx10<0x019>;
-defm S_SEXT_I32_I16 : SOP1_Real_gfx6_gfx7_gfx10<0x01a>;
-defm S_BITSET0_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x01b>;
-defm S_BITSET0_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01c>;
-defm S_BITSET1_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x01d>;
-defm S_BITSET1_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01e>;
-defm S_GETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01f>;
-defm S_SETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x020>;
-defm S_SWAPPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x021>;
-defm S_RFE_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x022>;
-defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x024>;
-defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x025>;
-defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x026>;
-defm S_ANDN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x027>;
-defm S_ORN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x028>;
-defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x029>;
-defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02a>;
-defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02b>;
-defm S_QUADMASK_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02c>;
-defm S_QUADMASK_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02d>;
-defm S_MOVRELS_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02e>;
-defm S_MOVRELS_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02f>;
-defm S_MOVRELD_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x030>;
-defm S_MOVRELD_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x031>;
-defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>;
-
-//===----------------------------------------------------------------------===//
-// SOP2 - GFX12
+defm S_FF1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x013, "s_ctz_i32_b32">;
+defm S_FF1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x014, "s_ctz_i32_b64">;
+defm S_FLBIT_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x015, "s_clz_i32_u32">;
+defm S_FLBIT_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x016, "s_clz_i32_u64">;
+defm S_FLBIT_I32 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x017, "s_cls_i32">;
+defm S_FLBIT_I32_I64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x018, "s_cls_i32_i64">;
+defm S_SEXT_I32_I8 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x019>;
+defm S_SEXT_I32_I16 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x01a>;
+defm S_BITSET0_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x01b>;
+defm S_BITSET0_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x01c>;
+defm S_BITSET1_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x01d>;
+defm S_BITSET1_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x01e>;
+defm S_GETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x01f, "s_get_pc_i64">;
+defm S_SETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x020, "s_set_pc_i64">;
+defm S_SWAPPC_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x021, "s_swap_pc_i64">;
+defm S_RFE_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x022, "s_rfe_i64">;
+defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x024>;
+defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x025>;
+defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x026>;
+defm S_ANDN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x027, "s_and_not1_saveexec_b64">;
+defm S_ORN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x028, "s_or_not1_saveexec_b64">;
+defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x029>;
+defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x02a>;
+defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<0x02b>;
+defm S_QUADMASK_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x02c>;
+defm S_QUADMASK_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x02d>;
+defm S_MOVRELS_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x02e>;
+defm S_MOVRELS_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x02f>;
+defm S_MOVRELD_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x030>;
+defm S_MOVRELD_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x031>;
+defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x034>;
+
+//===----------------------------------------------------------------------===//
+// SOP2 - GFX13.
+//===----------------------------------------------------------------------===//
+
+multiclass SOP2_Real_gfx13<bits<7> op, string name = !tolower(NAME)> {
+ defvar ps = !cast<SOP2_Pseudo>(NAME);
+ def _gfx13 : SOP2_Real32<op, ps, name>,
+ Select<GFX13Gen, ps.Mnemonic>;
+ if !ne(ps.Mnemonic, name) then
+ def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX13Only]>;
+}
+
+defm S_PACK_HL_B32_B16 : SOP2_Real_gfx13<0x37>;
+
+//===----------------------------------------------------------------------===//
+// SOP2 - GFX12, GFX13
//===----------------------------------------------------------------------===//
multiclass SOP2_Real_gfx12<bits<7> op, string name = !tolower(NAME)> {
@@ -2355,17 +2473,23 @@ multiclass SOP2_Real_gfx12<bits<7> op, string name = !tolower(NAME)> {
Select<GFX12Gen, ps.PseudoInstr>;
if !ne(ps.Mnemonic, name) then
def : AMDGPUMnemonicAlias<ps.Mnemonic, name> {
- let AssemblerPredicate = isGFX12Plus;
+ let AssemblerPredicate = isGFX12Only;
}
}
-defm S_MINIMUM_F32 : SOP2_Real_gfx12<0x04f>;
-defm S_MAXIMUM_F32 : SOP2_Real_gfx12<0x050>;
-defm S_MINIMUM_F16 : SOP2_Real_gfx12<0x051>;
-defm S_MAXIMUM_F16 : SOP2_Real_gfx12<0x052>;
+multiclass SOP2_Real_gfx12_gfx13<bits<7> op, string name = !tolower(NAME)> :
+ SOP2_Real_gfx12<op, name>, SOP2_Real_gfx13<op, name>;
+
+defm S_MINIMUM_F32 : SOP2_Real_gfx12_gfx13<0x04f>;
+defm S_MAXIMUM_F32 : SOP2_Real_gfx12_gfx13<0x050>;
+defm S_MINIMUM_F16 : SOP2_Real_gfx12_gfx13<0x051>;
+defm S_MAXIMUM_F16 : SOP2_Real_gfx12_gfx13<0x052>;
+defm S_ADD_U64 : SOP2_Real_gfx12_gfx13<0x053, "s_add_nc_u64">;
+defm S_SUB_U64 : SOP2_Real_gfx12_gfx13<0x054, "s_sub_nc_u64">;
+defm S_MUL_U64 : SOP2_Real_gfx12_gfx13<0x055>;
//===----------------------------------------------------------------------===//
-// SOP2 - GFX11, GFX12.
+// SOP2 - GFX11, GFX12, GFX13
//===----------------------------------------------------------------------===//
multiclass SOP2_Real_gfx11<bits<7> op, string name = !tolower(NAME)> {
@@ -2424,14 +2548,19 @@ defm S_MUL_HI_I32 : SOP2_Real_gfx11_gfx12<0x02e>;
defm S_CSELECT_B32 : SOP2_Real_gfx11_gfx12<0x030>;
defm S_CSELECT_B64 : SOP2_Real_gfx11_gfx12<0x031>;
defm S_PACK_HL_B32_B16 : SOP2_Real_gfx11_gfx12<0x035>;
-defm S_ADD_U64 : SOP2_Real_gfx12<0x053, "s_add_nc_u64">;
-defm S_SUB_U64 : SOP2_Real_gfx12<0x054, "s_sub_nc_u64">;
-defm S_MUL_U64 : SOP2_Real_gfx12<0x055>;
//===----------------------------------------------------------------------===//
-// SOP2 - GFX1150, GFX12
+// SOP2 - GFX1150, GFX12, GFX13
//===----------------------------------------------------------------------===//
+multiclass SOP2_Real_gfx11_gfx12_gfx13<bits<7> op> :
+ SOP2_Real_gfx11<op>, SOP2_Real_gfx12<op>, SOP2_Real_gfx13<op>;
+
+multiclass SOP2_Real_FMAK_gfx13<bits<7> op> {
+ def _gfx13 : SOP2_Real64<op, !cast<SOP2_Pseudo>(NAME)>,
+ Select<GFX13Gen, !cast<SOP2_Pseudo>(NAME).Mnemonic>;
+}
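+// The FMAK forms use SOP2_Real64 because they carry a 32-bit literal K,
+// doubling the encoded size relative to ordinary SOP2. Illustrative:
+//   s_fmaak_f32 s0, s1, s2, 0x40490fdb  ; s0 = s1 * s2 + pi (literal K)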
+
multiclass SOP2_Real_FMAK_gfx12<bits<7> op> {
def _gfx12 : SOP2_Real64<op, !cast<SOP2_Pseudo>(NAME)>,
Select<GFX12Gen, !cast<SOP2_Pseudo>(NAME).PseudoInstr>;
@@ -2442,35 +2571,36 @@ multiclass SOP2_Real_FMAK_gfx11<bits<7> op> {
Select<GFX11Gen, !cast<SOP2_Pseudo>(NAME).PseudoInstr>;
}
-multiclass SOP2_Real_FMAK_gfx11_gfx12<bits<7> op> :
- SOP2_Real_FMAK_gfx11<op>, SOP2_Real_FMAK_gfx12<op>;
+multiclass SOP2_Real_FMAK_gfx11_gfx12_gfx13<bits<7> op> :
+ SOP2_Real_FMAK_gfx11<op>, SOP2_Real_FMAK_gfx12<op>, SOP2_Real_FMAK_gfx13<op>;
-defm S_ADD_F32 : SOP2_Real_gfx11_gfx12<0x040>;
-defm S_SUB_F32 : SOP2_Real_gfx11_gfx12<0x041>;
-defm S_MUL_F32 : SOP2_Real_gfx11_gfx12<0x044>;
-defm S_FMAAK_F32 : SOP2_Real_FMAK_gfx11_gfx12<0x045>;
-defm S_FMAMK_F32 : SOP2_Real_FMAK_gfx11_gfx12<0x046>;
-defm S_FMAC_F32 : SOP2_Real_gfx11_gfx12<0x047>;
-defm S_CVT_PK_RTZ_F16_F32 : SOP2_Real_gfx11_gfx12<0x048>;
-defm S_ADD_F16 : SOP2_Real_gfx11_gfx12<0x049>;
-defm S_SUB_F16 : SOP2_Real_gfx11_gfx12<0x04a>;
-defm S_MUL_F16 : SOP2_Real_gfx11_gfx12<0x04d>;
-defm S_FMAC_F16 : SOP2_Real_gfx11_gfx12<0x04e>;
+defm S_ADD_F32 : SOP2_Real_gfx11_gfx12_gfx13<0x040>;
+defm S_SUB_F32 : SOP2_Real_gfx11_gfx12_gfx13<0x041>;
+defm S_MUL_F32 : SOP2_Real_gfx11_gfx12_gfx13<0x044>;
+defm S_FMAAK_F32 : SOP2_Real_FMAK_gfx11_gfx12_gfx13<0x045>;
+defm S_FMAMK_F32 : SOP2_Real_FMAK_gfx11_gfx12_gfx13<0x046>;
+defm S_FMAC_F32 : SOP2_Real_gfx11_gfx12_gfx13<0x047>;
+defm S_CVT_PK_RTZ_F16_F32 : SOP2_Real_gfx11_gfx12_gfx13<0x048>;
+defm S_ADD_F16 : SOP2_Real_gfx11_gfx12_gfx13<0x049>;
+defm S_SUB_F16 : SOP2_Real_gfx11_gfx12_gfx13<0x04a>;
+defm S_MUL_F16 : SOP2_Real_gfx11_gfx12_gfx13<0x04d>;
+defm S_FMAC_F16 : SOP2_Real_gfx11_gfx12_gfx13<0x04e>;
//===----------------------------------------------------------------------===//
-// SOP2 - GFX1150
+// SOP2 - GFX1150, GFX12, GFX13
//===----------------------------------------------------------------------===//
-multiclass SOP2_Real_gfx11_Renamed_gfx12<bits<7> op, string gfx12_name> :
- SOP2_Real_gfx11<op>, SOP2_Real_gfx12<op, gfx12_name>;
+multiclass SOP2_Real_gfx11_Renamed_gfx12_gfx13<bits<7> op, string gfx12_gfx13_name> :
+ SOP2_Real_gfx11<op>, SOP2_Real_gfx12<op, gfx12_gfx13_name>,
+ SOP2_Real_gfx13<op, gfx12_gfx13_name>;
-defm S_MIN_F32 : SOP2_Real_gfx11_Renamed_gfx12<0x042, "s_min_num_f32">;
-defm S_MAX_F32 : SOP2_Real_gfx11_Renamed_gfx12<0x043, "s_max_num_f32">;
-defm S_MIN_F16 : SOP2_Real_gfx11_Renamed_gfx12<0x04b, "s_min_num_f16">;
-defm S_MAX_F16 : SOP2_Real_gfx11_Renamed_gfx12<0x04c, "s_max_num_f16">;
+defm S_MIN_F32 : SOP2_Real_gfx11_Renamed_gfx12_gfx13<0x042, "s_min_num_f32">;
+defm S_MAX_F32 : SOP2_Real_gfx11_Renamed_gfx12_gfx13<0x043, "s_max_num_f32">;
+defm S_MIN_F16 : SOP2_Real_gfx11_Renamed_gfx12_gfx13<0x04b, "s_min_num_f16">;
+defm S_MAX_F16 : SOP2_Real_gfx11_Renamed_gfx12_gfx13<0x04c, "s_max_num_f16">;
//===----------------------------------------------------------------------===//
-// SOP2 - GFX10.
+// SOP2 - GFX10, GFX11, GFX12, GFX13
//===----------------------------------------------------------------------===//
multiclass SOP2_Real_gfx10<bits<7> op> {
@@ -2479,21 +2609,25 @@ multiclass SOP2_Real_gfx10<bits<7> op> {
Select<GFX10Gen, ps.PseudoInstr>;
}
-multiclass SOP2_Real_gfx10_gfx11_gfx12<bits<7> op> :
- SOP2_Real_gfx10<op>, SOP2_Real_gfx11_gfx12<op>;
+multiclass SOP2_Real_gfx10_gfx13<bits<7> op> :
+ SOP2_Real_gfx10<op>, SOP2_Real_gfx13<op>;
-defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10<0x02e>;
-defm S_LSHL2_ADD_U32 : SOP2_Real_gfx10<0x02f>;
-defm S_LSHL3_ADD_U32 : SOP2_Real_gfx10<0x030>;
-defm S_LSHL4_ADD_U32 : SOP2_Real_gfx10<0x031>;
-defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12<0x032>;
-defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12<0x033>;
-defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12<0x034>;
-defm S_MUL_HI_U32 : SOP2_Real_gfx10<0x035>;
-defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>;
+multiclass SOP2_Real_gfx10_gfx11_gfx12_gfx13<bits<7> op> :
+ SOP2_Real_gfx10<op>, SOP2_Real_gfx11<op>, SOP2_Real_gfx12<op>,
+ SOP2_Real_gfx13<op>;
+
+defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10_gfx13<0x02e>;
+defm S_LSHL2_ADD_U32 : SOP2_Real_gfx10_gfx13<0x02f>;
+defm S_LSHL3_ADD_U32 : SOP2_Real_gfx10_gfx13<0x030>;
+defm S_LSHL4_ADD_U32 : SOP2_Real_gfx10_gfx13<0x031>;
+defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12_gfx13<0x032>;
+defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12_gfx13<0x033>;
+defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12_gfx13<0x034>;
+defm S_MUL_HI_U32 : SOP2_Real_gfx10_gfx13<0x035>;
+defm S_MUL_HI_I32 : SOP2_Real_gfx10_gfx13<0x036>;
//===----------------------------------------------------------------------===//
-// SOP2 - GFX6, GFX7.
+// SOP2 - GFX6, GFX7, GFX10, GFX11, GFX12, GFX13
//===----------------------------------------------------------------------===//
multiclass SOP2_Real_gfx6_gfx7<bits<7> op> {
@@ -2502,57 +2636,105 @@ multiclass SOP2_Real_gfx6_gfx7<bits<7> op> {
Select_gfx6_gfx7<ps.PseudoInstr>;
}
-multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> :
- SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>;
+multiclass SOP2_Real_gfx6_gfx7_gfx10_gfx13<bits<7> op> :
+ SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>, SOP2_Real_gfx13<op>;
+
+multiclass SOP2_Real_gfx6_gfx7_gfx10_Renamed_gfx13<bits<7> op, string gfx13_name> :
+ SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>, SOP2_Real_gfx13<op, gfx13_name>;
-multiclass SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<bits<7> op, string gfx12_name> :
+multiclass SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<bits<7> op, string gfx12_gfx13_name> :
SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>, SOP2_Real_gfx11<op>,
- SOP2_Real_gfx12<op, gfx12_name>;
+ SOP2_Real_gfx12<op, gfx12_gfx13_name>, SOP2_Real_gfx13<op, gfx12_gfx13_name>;
defm S_CBRANCH_G_FORK : SOP2_Real_gfx6_gfx7<0x02b>;
-defm S_ADD_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x000, "s_add_co_u32">;
-defm S_SUB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x001, "s_sub_co_u32">;
-defm S_ADD_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x002, "s_add_co_i32">;
-defm S_SUB_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x003, "s_sub_co_i32">;
-defm S_ADDC_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x004, "s_add_co_ci_u32">;
-defm S_SUBB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x005, "s_sub_co_ci_u32">;
-defm S_MIN_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x006>;
-defm S_MIN_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x007>;
-defm S_MAX_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x008>;
-defm S_MAX_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x009>;
-defm S_CSELECT_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x00a>;
-defm S_CSELECT_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x00b>;
-defm S_AND_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x00e>;
-defm S_AND_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x00f>;
-defm S_OR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x010>;
-defm S_OR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x011>;
-defm S_XOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x012>;
-defm S_XOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x013>;
-defm S_ANDN2_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x014>;
-defm S_ANDN2_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x015>;
-defm S_ORN2_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x016>;
-defm S_ORN2_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x017>;
-defm S_NAND_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x018>;
-defm S_NAND_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x019>;
-defm S_NOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01a>;
-defm S_NOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01b>;
-defm S_XNOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01c>;
-defm S_XNOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01d>;
-defm S_LSHL_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01e>;
-defm S_LSHL_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01f>;
-defm S_LSHR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x020>;
-defm S_LSHR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x021>;
-defm S_ASHR_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x022>;
-defm S_ASHR_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x023>;
-defm S_BFM_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x024>;
-defm S_BFM_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x025>;
-defm S_MUL_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x026>;
-defm S_BFE_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x027>;
-defm S_BFE_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x028>;
-defm S_BFE_U64 : SOP2_Real_gfx6_gfx7_gfx10<0x029>;
-defm S_BFE_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x02a>;
-defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>;
+defm S_ADD_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x000, "s_add_co_u32">;
+defm S_SUB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x001, "s_sub_co_u32">;
+defm S_ADD_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x002, "s_add_co_i32">;
+defm S_SUB_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x003, "s_sub_co_i32">;
+defm S_ADDC_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x004, "s_add_co_ci_u32">;
+defm S_SUBB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x005, "s_sub_co_ci_u32">;
+defm S_MIN_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x006>;
+defm S_MIN_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x007>;
+defm S_MAX_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x008>;
+defm S_MAX_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x009>;
+defm S_CSELECT_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x00a>;
+defm S_CSELECT_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x00b>;
+defm S_AND_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x00e>;
+defm S_AND_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x00f>;
+defm S_OR_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x010>;
+defm S_OR_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x011>;
+defm S_XOR_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x012>;
+defm S_XOR_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x013>;
+defm S_ANDN2_B32 : SOP2_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x014, "s_and_not1_b32">;
+defm S_ANDN2_B64 : SOP2_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x015, "s_and_not1_b64">;
+defm S_ORN2_B32 : SOP2_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x016, "s_or_not1_b32">;
+defm S_ORN2_B64 : SOP2_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x017, "s_or_not1_b64">;
+defm S_NAND_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x018>;
+defm S_NAND_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x019>;
+defm S_NOR_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01a>;
+defm S_NOR_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01b>;
+defm S_XNOR_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01c>;
+defm S_XNOR_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01d>;
+defm S_LSHL_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01e>;
+defm S_LSHL_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01f>;
+defm S_LSHR_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x020>;
+defm S_LSHR_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x021>;
+defm S_ASHR_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x022>;
+defm S_ASHR_I64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x023>;
+defm S_BFM_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x024>;
+defm S_BFM_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x025>;
+defm S_MUL_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x026>;
+defm S_BFE_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x027>;
+defm S_BFE_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x028>;
+defm S_BFE_U64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x029>;
+defm S_BFE_I64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x02a>;
+defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x02c>;
+
+//===----------------------------------------------------------------------===//
+// SOPK - GFX10 only.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPK_Real32_gfx10<bits<5> op> {
+ defvar ps = !cast<SOPK_Pseudo>(NAME);
+ def _gfx10 : SOPK_Real32<op, ps>,
+ Select<GFX10Gen, ps.PseudoInstr>;
+}
+
+multiclass SOPK_Real64_gfx10<bits<5> op> {
+ defvar ps = !cast<SOPK_Pseudo>(NAME);
+ def _gfx10 : SOPK_Real64<op, ps>,
+ Select<GFX10Gen, ps.PseudoInstr>;
+}
+
+defm S_WAITCNT_VSCNT : SOPK_Real32_gfx10<0x017>;
+defm S_WAITCNT_VMCNT : SOPK_Real32_gfx10<0x018>;
+defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx10<0x019>;
+defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx10<0x01a>;
+defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx10<0x01b>;
+defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx10<0x01c>;
+
+//===----------------------------------------------------------------------===//
+// SOPK - GFX11 only.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPK_Real32_gfx11<bits<5> op> {
+ def _gfx11 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
+ Select<GFX11Gen, !cast<SOPK_Pseudo>(NAME).PseudoInstr>;
+}
+
+multiclass SOPK_Real64_gfx11<bits<5> op> {
+ def _gfx11 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
+ Select<GFX11Gen, !cast<SOPK_Pseudo>(NAME).PseudoInstr>;
+}
+
+defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx11<0x016>;
+defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx11<0x017>;
+defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11<0x018>;
+defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11<0x019>;
+defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11<0x01a>;
+defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11<0x01b>;
//===----------------------------------------------------------------------===//
// SOPK - GFX11, GFX12.
@@ -2568,21 +2750,11 @@ multiclass SOPK_Real32_gfx12<bits<5> op, string name = !tolower(NAME)> {
}
}
-multiclass SOPK_Real32_gfx11<bits<5> op> {
- def _gfx11 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
- Select<GFX11Gen, !cast<SOPK_Pseudo>(NAME).PseudoInstr>;
-}
-
multiclass SOPK_Real64_gfx12<bits<5> op> {
def _gfx12 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
Select<GFX12Gen, !cast<SOPK_Pseudo>(NAME).PseudoInstr>;
}
-multiclass SOPK_Real64_gfx11<bits<5> op> {
- def _gfx11 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
- Select<GFX11Gen, !cast<SOPK_Pseudo>(NAME).PseudoInstr>;
-}
-
multiclass SOPK_Real32_gfx11_gfx12<bits<5> op> :
SOPK_Real32_gfx11<op>, SOPK_Real32_gfx12<op>;
@@ -2604,43 +2776,39 @@ defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx11_gfx12<0x013>;
let OtherPredicates = [isNotGFX1250Plus] in
defm S_CALL_B64 : SOPK_Real32_gfx11_gfx12<0x014>;
defm S_CALL_B64 : SOPK_Real32_gfx1250<0x014, "s_call_i64">;
-defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx11<0x016>;
-defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx11<0x017>;
-defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11<0x018>;
-defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11<0x019>;
-defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11<0x01a>;
-defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11<0x01b>;
//===----------------------------------------------------------------------===//
-// SOPK - GFX10.
+// SOPK - GFX10, GFX11, GFX12, GFX13.
//===----------------------------------------------------------------------===//
-multiclass SOPK_Real32_gfx10<bits<5> op> {
+multiclass SOPK_Real32_gfx13<bits<5> op, string name = !tolower(NAME)> {
defvar ps = !cast<SOPK_Pseudo>(NAME);
- def _gfx10 : SOPK_Real32<op, ps>,
- Select<GFX10Gen, ps.PseudoInstr>;
+ def _gfx13 : SOPK_Real32<op, ps, name>,
+ Select<GFX13Gen, ps.Mnemonic>;
+ if !ne(ps.Mnemonic, name) then
+ def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX13Only]>;
}
-multiclass SOPK_Real64_gfx10<bits<5> op> {
+multiclass SOPK_Real64_gfx13<bits<5> op> {
defvar ps = !cast<SOPK_Pseudo>(NAME);
- def _gfx10 : SOPK_Real64<op, ps>,
- Select<GFX10Gen, ps.PseudoInstr>;
+ def _gfx13 : SOPK_Real64<op, ps>,
+ Select<GFX13Gen, ps.Mnemonic>;
}
-multiclass SOPK_Real32_gfx10_gfx11<bits<5> op> :
- SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>;
+multiclass SOPK_Real32_gfx10_gfx11_gfx12_gfx13<bits<5> op> :
+ SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>, SOPK_Real32_gfx12<op>,
+ SOPK_Real32_gfx13<op>;
-multiclass SOPK_Real32_gfx10_gfx11_gfx12<bits<5> op> :
- SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11_gfx12<op>;
+defm S_VERSION : SOPK_Real32_gfx10_gfx11_gfx12_gfx13<0x001>;
-defm S_VERSION : SOPK_Real32_gfx10_gfx11_gfx12<0x001>;
-defm S_CALL_B64 : SOPK_Real32_gfx10<0x016>;
-defm S_WAITCNT_VSCNT : SOPK_Real32_gfx10<0x017>;
-defm S_WAITCNT_VMCNT : SOPK_Real32_gfx10<0x018>;
-defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx10<0x019>;
-defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx10<0x01a>;
-defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx10<0x01b>;
-defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx10<0x01c>;
+//===----------------------------------------------------------------------===//
+// SOPK - GFX10, GFX13.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPK_Real32_gfx10_Renamed_gfx13<bits<5> op, string gfx13_name> :
+ SOPK_Real32_gfx10<op>, SOPK_Real32_gfx13<op, gfx13_name>;
+
+defm S_CALL_B64 : SOPK_Real32_gfx10_Renamed_gfx13<0x016, "s_call_i64">;
//===----------------------------------------------------------------------===//
// SOPK - GFX6, GFX7.
@@ -2652,32 +2820,15 @@ multiclass SOPK_Real32_gfx6_gfx7<bits<5> op> {
Select_gfx6_gfx7<ps.PseudoInstr>;
}
-multiclass SOPK_Real64_gfx6_gfx7<bits<5> op> {
- defvar ps = !cast<SOPK_Pseudo>(NAME);
- def _gfx6_gfx7 : SOPK_Real64<op, ps>,
- Select_gfx6_gfx7<ps.PseudoInstr>;
-}
-
-multiclass SOPK_Real32_gfx6_gfx7_gfx10<bits<5> op> :
- SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>;
+defm S_CBRANCH_I_FORK : SOPK_Real32_gfx6_gfx7<0x011>;
-multiclass SOPK_Real64_gfx6_gfx7_gfx10<bits<5> op> :
- SOPK_Real64_gfx6_gfx7<op>, SOPK_Real64_gfx10<op>;
+//===----------------------------------------------------------------------===//
+// SOPK - GFX6, GFX7, GFX10, GFX11.
+//===----------------------------------------------------------------------===//
multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11<bits<5> op> :
- SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10_gfx11<op>;
+ SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>;
-multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<bits<5> op> :
- SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10_gfx11_gfx12<op>;
-
-multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<bits<5> op, string gfx12_name> :
- SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>,
- SOPK_Real32_gfx12<op, gfx12_name>;
-
-defm S_CBRANCH_I_FORK : SOPK_Real32_gfx6_gfx7<0x011>;
-
-defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<0x000>;
-defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<0x002>;
defm S_CMPK_EQ_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x003>;
defm S_CMPK_LG_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x004>;
defm S_CMPK_GT_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x005>;
@@ -2690,11 +2841,71 @@ defm S_CMPK_GT_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00b>;
defm S_CMPK_GE_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00c>;
defm S_CMPK_LT_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00d>;
defm S_CMPK_LE_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00e>;
-defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x00f, "s_addk_co_i32">;
-defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<0x010>;
-defm S_GETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x012>;
-defm S_SETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x013>;
-defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>;
+
+//===----------------------------------------------------------------------===//
+// SOPK - GFX6, GFX7, GFX10, GFX13.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPK_Real64_gfx6_gfx7<bits<5> op> {
+ defvar ps = !cast<SOPK_Pseudo>(NAME);
+ def _gfx6_gfx7 : SOPK_Real64<op, ps>,
+ Select_gfx6_gfx7<ps.PseudoInstr>;
+}
+
+multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx13<bits<5> op> :
+ SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>, SOPK_Real32_gfx13<op>;
+
+multiclass SOPK_Real64_gfx6_gfx7_gfx10_gfx13<bits<5> op> :
+ SOPK_Real64_gfx6_gfx7<op>, SOPK_Real64_gfx10<op>, SOPK_Real64_gfx13<op>;
+
+defm S_GETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx13<0x012>;
+defm S_SETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx13<0x013>;
+defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10_gfx13<0x015>;
+
+//===----------------------------------------------------------------------===//
+// SOPK - GFX6, GFX7, GFX10, GFX11, GFX12, GFX13.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<bits<5> op> :
+ SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>,
+ SOPK_Real32_gfx12<op>, SOPK_Real32_gfx13<op>;
+
+multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<bits<5> op, string gfx12_gfx13_name> :
+ SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>,
+ SOPK_Real32_gfx12<op, gfx12_gfx13_name>, SOPK_Real32_gfx13<op, gfx12_gfx13_name>;
+
+defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<0x000>;
+defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<0x002>;
+defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x00f, "s_addk_co_i32">;
+defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<0x010>;
+
+//===----------------------------------------------------------------------===//
+// SOPP - GFX13 only.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPP_Real_32_gfx13<bits<7> op, string name = !cast<SOPP_Pseudo>(NAME).Mnemonic, bit compat_alias = 1> {
+ defvar ps = !cast<SOPP_Pseudo>(NAME);
+ def _gfx13 : SOPP_Real_32<op, ps, name>,
+ Select<GFX13Gen, ps.Mnemonic>,
+ SOPPRelaxTable<0, ps.KeyName, "_gfx13">;
+ if !and(compat_alias, !ne(ps.Mnemonic, name)) then
+ def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX13Only]>;
+}
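+// Unlike the SOP1/SOP2 helpers above, compat_alias lets a definition opt
+// out of the legacy-spelling alias. Hypothetical usage:
+//   defm S_FOO : SOPP_Real_32_gfx13<0x30, "s_foo_new", /*compat_alias=*/0>;
+// All defms below keep the default of 1.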
+
+multiclass SOPP_Real_64_gfx13<bits<7> op> {
+ defvar ps = !cast<SOPP_Pseudo>(NAME);
+ def _gfx13 : SOPP_Real_64<op, ps, ps.Mnemonic>,
+ Select<GFX13Gen, ps.Mnemonic>,
+ SOPPRelaxTable<1, ps.KeyName, "_gfx13">;
+}
+
+defm S_WAKEUP_imm : SOPP_Real_32_gfx13<0x003>;
+defm S_BARRIER_WAIT : SOPP_Real_32_gfx13<0x2b>;
+defm S_MONITOR_SLEEP : SOPP_Real_32_gfx13<0x2c>;
+defm S_DELAY_ALU : SOPP_Real_32_gfx13<0x2e>;
+defm S_WAIT_EVENT : SOPP_Real_32_gfx13<0x2f>;
+defm S_BARRIER_LEAVE_IMM : SOPP_Real_32_gfx13<0x31>;
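+// Note the SOPP opcode space is renumbered on GFX13: s_barrier_wait moves
+// from 0x014 (GFX12, below) to 0x02b and s_delay_alu from 0x007 to 0x02e,
+// which is why each generation carries its own *_Real_* definitions.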
//===----------------------------------------------------------------------===//
// SOPP - GFX12 only.
@@ -2706,35 +2917,23 @@ multiclass SOPP_Real_32_gfx12<bits<7> op, string name = !tolower(NAME)> {
Select<GFX12Gen, ps.PseudoInstr>;
if !ne(ps.Mnemonic, name) then
def : AMDGPUMnemonicAlias<ps.Mnemonic, name> {
- let AssemblerPredicate = isGFX12Plus;
+ let AssemblerPredicate = isGFX12Only;
}
}
+multiclass SOPP_Real_64_gfx12<bits<7> op> {
+ def _gfx12 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ Select<GFX12Gen, !cast<SOPP_Pseudo>(NAME).PseudoInstr>,
+ SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx12">;
+}
+
defm S_BARRIER_WAIT : SOPP_Real_32_gfx12<0x014>;
defm S_BARRIER_LEAVE : SOPP_Real_32_gfx12<0x015>;
-defm S_WAIT_LOADCNT : SOPP_Real_32_gfx12<0x040>;
-defm S_WAIT_STORECNT : SOPP_Real_32_gfx12<0x041>;
-defm S_WAIT_SAMPLECNT : SOPP_Real_32_gfx12<0x042>;
defm S_WAIT_BVHCNT : SOPP_Real_32_gfx12<0x043>;
-defm S_WAIT_EXPCNT : SOPP_Real_32_gfx12<0x044>;
-defm S_WAIT_DSCNT : SOPP_Real_32_gfx12<0x046>;
-defm S_WAIT_KMCNT : SOPP_Real_32_gfx12<0x047>;
-defm S_WAIT_LOADCNT_DSCNT : SOPP_Real_32_gfx12<0x048>;
-defm S_WAIT_STORECNT_DSCNT : SOPP_Real_32_gfx12<0x049>;
//===----------------------------------------------------------------------===//
-// SOPP - GFX1250 only.
+// SOPP - GFX11 only.
//===----------------------------------------------------------------------===//
-defm S_SET_VGPR_MSB : SOPP_Real_32_gfx12<0x006>;
-defm S_SETPRIO_INC_WG : SOPP_Real_32_gfx12<0x03e>;
-defm S_WAIT_XCNT : SOPP_Real_32_gfx12<0x045>;
-defm S_WAIT_ASYNCCNT : SOPP_Real_32_gfx12<0x04a>;
-defm S_WAIT_TENSORCNT : SOPP_Real_32_gfx12<0x04b>;
-
-//===----------------------------------------------------------------------===//
-// SOPP - GFX11, GFX12.
-//===----------------------------------------------------------------------===//
-
multiclass SOPP_Real_32_gfx11<bits<7> op, string name = !tolower(NAME)> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
@@ -2747,94 +2946,91 @@ multiclass SOPP_Real_32_gfx11<bits<7> op, string name = !tolower(NAME)> {
}
}
-multiclass SOPP_Real_64_gfx12<bits<7> op> {
- def _gfx12 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>,
- Select<GFX12Gen, !cast<SOPP_Pseudo>(NAME).PseudoInstr>,
- SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx12">;
-}
-
multiclass SOPP_Real_64_gfx11<bits<7> op> {
def _gfx11 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>,
Select<GFX11Gen, !cast<SOPP_Pseudo>(NAME).PseudoInstr>,
SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx11">;
}
-multiclass SOPP_Real_32_gfx11_gfx12<bits<7> op> :
- SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op>;
-
-multiclass SOPP_Real_32_gfx11_Renamed_gfx12<bits<7> op, string gfx12_name> :
- SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op, gfx12_name>;
-
-multiclass SOPP_Real_With_Relaxation_gfx12<bits<7> op> {
- defm "" : SOPP_Real_32_gfx12<op>;
- let isCodeGenOnly = 1 in
- defm _pad_s_nop : SOPP_Real_64_gfx12<op>;
-}
-
multiclass SOPP_Real_With_Relaxation_gfx11<bits<7> op> {
defm "" : SOPP_Real_32_gfx11<op>;
let isCodeGenOnly = 1 in
defm _pad_s_nop : SOPP_Real_64_gfx11<op>;
}
-multiclass SOPP_Real_With_Relaxation_gfx11_gfx12<bits<7>op> :
- SOPP_Real_With_Relaxation_gfx11<op>, SOPP_Real_With_Relaxation_gfx12<op>;
-
-defm S_SETKILL : SOPP_Real_32_gfx11_gfx12<0x001>;
-defm S_SETHALT : SOPP_Real_32_gfx11_gfx12<0x002>;
-defm S_SLEEP : SOPP_Real_32_gfx11_gfx12<0x003>;
defm S_INST_PREFETCH : SOPP_Real_32_gfx11<0x004, "s_set_inst_prefetch_distance">;
-defm S_CLAUSE : SOPP_Real_32_gfx11_gfx12<0x005>;
-defm S_DELAY_ALU : SOPP_Real_32_gfx11_gfx12<0x007>;
-defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx11_Renamed_gfx12<0x008, "s_wait_alu">;
-defm S_WAITCNT : SOPP_Real_32_gfx11_gfx12<0x009>;
-defm S_WAIT_IDLE : SOPP_Real_32_gfx11_gfx12<0x00a>;
-defm S_WAIT_EVENT : SOPP_Real_32_gfx11_gfx12<0x00b>;
-defm S_TRAP : SOPP_Real_32_gfx11_gfx12<0x010>;
-defm S_ROUND_MODE : SOPP_Real_32_gfx11_gfx12<0x011>;
-defm S_DENORM_MODE : SOPP_Real_32_gfx11_gfx12<0x012>;
-defm S_BRANCH : SOPP_Real_With_Relaxation_gfx11_gfx12<0x020>;
-defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx11_gfx12<0x021>;
-defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx11_gfx12<0x022>;
-defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x023>;
-defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x024>;
-defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x025>;
-defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x026>;
defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx11<0x027>;
defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx11<0x028>;
defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx11<0x029>;
defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx11<0x02a>;
-defm S_ENDPGM : SOPP_Real_32_gfx11_gfx12<0x030>;
-defm S_ENDPGM_SAVED : SOPP_Real_32_gfx11_gfx12<0x031>;
defm S_ENDPGM_ORDERED_PS_DONE : SOPP_Real_32_gfx11<0x032>;
-defm S_WAKEUP : SOPP_Real_32_gfx11_gfx12<0x034>;
-defm S_SETPRIO : SOPP_Real_32_gfx11_gfx12<0x035>;
-defm S_SENDMSG : SOPP_Real_32_gfx11_gfx12<0x036>;
-defm S_SENDMSGHALT : SOPP_Real_32_gfx11_gfx12<0x037>;
-defm S_INCPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x038>;
-defm S_DECPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x039>;
-defm S_TTRACEDATA : SOPP_Real_32_gfx11_gfx12<0x03a>;
-defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx11_gfx12<0x03b>;
-defm S_ICACHE_INV : SOPP_Real_32_gfx11_gfx12<0x03c>;
-
defm S_BARRIER : SOPP_Real_32_gfx11<0x03d>;
//===----------------------------------------------------------------------===//
-// SOPP - GFX1250.
+// SOPP - GFX10 only.
//===----------------------------------------------------------------------===//
-defm S_MONITOR_SLEEP : SOPP_Real_32_gfx12<0x004>;
+multiclass SOPP_Real_32_gfx10<bits<7> op> {
+ defvar ps = !cast<SOPP_Pseudo>(NAME);
+ def _gfx10 : SOPP_Real_32<op, ps>,
+ Select<GFX10Gen, ps.PseudoInstr>,
+ SOPPRelaxTable<0, ps.KeyName, "_gfx10">;
+}
+
+defm S_INST_PREFETCH : SOPP_Real_32_gfx10<0x020>;
//===----------------------------------------------------------------------===//
-// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10
+// SOPP - GFX12, GFX13.
//===----------------------------------------------------------------------===//
-multiclass SOPP_Real_32_gfx6_gfx7<bits<7> op> {
- defvar ps = !cast<SOPP_Pseudo>(NAME);
- def _gfx6_gfx7 : SOPP_Real_32<op, ps, !cast<SOPP_Pseudo>(NAME).Mnemonic>,
- Select_gfx6_gfx7<ps.PseudoInstr>,
- SOPPRelaxTable<0, ps.KeyName, "_gfx6_gfx7">;
-}
+multiclass SOPP_Real_32_gfx12_gfx13<bits<7> op> :
+ SOPP_Real_32_gfx12<op>, SOPP_Real_32_gfx13<op>;
+
+defm S_WAIT_LOADCNT : SOPP_Real_32_gfx12_gfx13<0x040>;
+defm S_WAIT_STORECNT : SOPP_Real_32_gfx12_gfx13<0x041>;
+defm S_WAIT_SAMPLECNT : SOPP_Real_32_gfx12_gfx13<0x042>;
+defm S_WAIT_EXPCNT : SOPP_Real_32_gfx12_gfx13<0x044>;
+defm S_WAIT_DSCNT : SOPP_Real_32_gfx12_gfx13<0x046>;
+defm S_WAIT_KMCNT : SOPP_Real_32_gfx12_gfx13<0x047>;
+defm S_WAIT_LOADCNT_DSCNT : SOPP_Real_32_gfx12_gfx13<0x048>;
+defm S_WAIT_STORECNT_DSCNT : SOPP_Real_32_gfx12_gfx13<0x049>;
+
+//===----------------------------------------------------------------------===//
+// SOPP - GFX1250 only.
+//===----------------------------------------------------------------------===//
+
+defm S_MONITOR_SLEEP : SOPP_Real_32_gfx12<0x004>;
+defm S_SET_VGPR_MSB : SOPP_Real_32_gfx12<0x006>;
+defm S_WAIT_XCNT : SOPP_Real_32_gfx12<0x045>;
+
+//===----------------------------------------------------------------------===//
+// SOPP - GFX1250, GFX13.
+//===----------------------------------------------------------------------===//
+
+defm S_SETPRIO_INC_WG : SOPP_Real_32_gfx12_gfx13<0x03e>;
+defm S_WAIT_ASYNCCNT : SOPP_Real_32_gfx12_gfx13<0x04a>;
+defm S_WAIT_TENSORCNT : SOPP_Real_32_gfx12_gfx13<0x04b>;
+
+//===----------------------------------------------------------------------===//
+// SOPP - GFX10, GFX13.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPP_Real_32_gfx10_gfx13<bits<7> op> :
+ SOPP_Real_32_gfx10<op>, SOPP_Real_32_gfx13<op>;
+
+multiclass SOPP_Real_32_gfx10_Renamed_gfx13<bits<7> op, string gfx13_name> :
+ SOPP_Real_32_gfx10<op>, SOPP_Real_32_gfx13<op, gfx13_name>;
+
+defm S_CLAUSE : SOPP_Real_32_gfx10_gfx13<0x021>;
+defm S_WAIT_IDLE : SOPP_Real_32_gfx10_gfx13<0x022>;
+defm S_ROUND_MODE : SOPP_Real_32_gfx10_gfx13<0x024>;
+defm S_DENORM_MODE : SOPP_Real_32_gfx10_gfx13<0x025>;
+defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx10_gfx13<0x028>;
+defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx10_Renamed_gfx13<0x023, "s_wait_alu">;
+
+//===----------------------------------------------------------------------===//
+// SOPP - GFX8, GFX9.
+//===----------------------------------------------------------------------===//
multiclass SOPP_Real_32_gfx8_gfx9<bits<7> op> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
@@ -2843,27 +3039,46 @@ multiclass SOPP_Real_32_gfx8_gfx9<bits<7> op> {
SOPPRelaxTable<0, ps.KeyName, "_vi">;
}
-multiclass SOPP_Real_32_gfx10<bits<7> op> {
+defm S_SET_GPR_IDX_OFF : SOPP_Real_32_gfx8_gfx9<0x01c>;
+defm S_SET_GPR_IDX_MODE : SOPP_Real_32_gfx8_gfx9<0x01d>;
+
+//===----------------------------------------------------------------------===//
+// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPP_Real_32_gfx6_gfx7<bits<7> op> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
- def _gfx10 : SOPP_Real_32<op, ps>,
- Select<GFX10Gen, ps.PseudoInstr>,
- SOPPRelaxTable<0, ps.KeyName, "_gfx10">;
+ def _gfx6_gfx7 : SOPP_Real_32<op, ps, !cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ Select_gfx6_gfx7<ps.PseudoInstr>,
+ SOPPRelaxTable<0, ps.KeyName, "_gfx6_gfx7">;
}
-multiclass SOPP_Real_32_gfx8_gfx9_gfx10<bits<7> op> :
- SOPP_Real_32_gfx8_gfx9<op>, SOPP_Real_32_gfx10<op>;
-
multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<bits<7> op> :
SOPP_Real_32_gfx6_gfx7<op>, SOPP_Real_32_gfx8_gfx9<op>;
multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> :
SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<op>, SOPP_Real_32_gfx10<op>;
-multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<bits<7> op> :
- SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op>, SOPP_Real_32_gfx11_gfx12<op>;
+defm S_BARRIER : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00a>;
+defm S_WAITCNT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00c>;
+
+//===----------------------------------------------------------------------===//
+// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10, GFX11, GFX12, GFX13.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<bits<7> op> :
+ SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op>, SOPP_Real_32_gfx11<op>,
+ SOPP_Real_32_gfx12<op>, SOPP_Real_32_gfx13<op>;
+
+defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x000>;
-multiclass SOPP_Real_32_gfx10_gfx11_gfx12<bits<7> op> :
- SOPP_Real_32_gfx10<op>, SOPP_Real_32_gfx11_gfx12<op>;
+//===----------------------------------------------------------------------===//
+// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10, GFX13.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<bits<7> op> :
+ SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<op>, SOPP_Real_32_gfx10<op>,
+ SOPP_Real_32_gfx13<op>;
// 64-bit encodings, for relaxation.
multiclass SOPP_Real_64_gfx6_gfx7<bits<7> op> {
@@ -2890,6 +3105,44 @@ multiclass SOPP_Real_64_gfx10<bits<7> op> {
multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<bits<7> op> :
SOPP_Real_64_gfx6_gfx7<op>, SOPP_Real_64_gfx8_gfx9<op>;
+multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<bits<7> op> :
+ SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<op>, SOPP_Real_64_gfx10<op>,
+ SOPP_Real_64_gfx13<op>;
+
+multiclass SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<bits<7> op> {
+ defm "" : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<op>;
+ let isCodeGenOnly = 1 in
+ defm _pad_s_nop : SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<op>;
+}
+
+defm S_ENDPGM : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x001>;
+defm S_SETHALT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x00d>;
+defm S_SETKILL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x00b>;
+defm S_SLEEP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x00e>;
+defm S_SETPRIO : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x00f>;
+defm S_SENDMSG : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x010>;
+defm S_SENDMSGHALT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x011>;
+defm S_TRAP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x012>;
+defm S_ICACHE_INV : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x013>;
+defm S_INCPERFLEVEL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x014>;
+defm S_DECPERFLEVEL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x015>;
+defm S_TTRACEDATA : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x016>;
+defm S_ENDPGM_SAVED : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x01B>;
+
+let isBranch = 1 in {
+defm S_BRANCH : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x002>;
+defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x004>;
+defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x005>;
+defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x006>;
+defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x007>;
+defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x008>;
+defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x009>;
+}
+
+//===----------------------------------------------------------------------===//
+// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10.
+//===----------------------------------------------------------------------===//
+
multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> :
SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<op>, SOPP_Real_64_gfx10<op>;
@@ -2900,43 +3153,7 @@ multiclass SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> {
defm _pad_s_nop : SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<op>;
}
-defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x000>;
-defm S_ENDPGM : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x001>;
-defm S_WAKEUP : SOPP_Real_32_gfx8_gfx9_gfx10<0x003>;
-defm S_BARRIER : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00a>;
-defm S_WAITCNT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00c>;
-defm S_SETHALT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00d>;
-defm S_SETKILL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00b>;
-defm S_SLEEP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00e>;
-defm S_SETPRIO : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00f>;
-defm S_SENDMSG : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x010>;
-defm S_SENDMSGHALT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x011>;
-defm S_TRAP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x012>;
-defm S_ICACHE_INV : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x013>;
-defm S_INCPERFLEVEL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x014>;
-defm S_DECPERFLEVEL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x015>;
-defm S_TTRACEDATA : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x016>;
-defm S_ENDPGM_SAVED : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x01B>;
-defm S_SET_GPR_IDX_OFF : SOPP_Real_32_gfx8_gfx9<0x01c>;
-defm S_SET_GPR_IDX_MODE : SOPP_Real_32_gfx8_gfx9<0x01d>;
-defm S_ENDPGM_ORDERED_PS_DONE : SOPP_Real_32_gfx8_gfx9_gfx10<0x01e>;
-defm S_CODE_END : SOPP_Real_32_gfx10_gfx11_gfx12<0x01f>;
-defm S_INST_PREFETCH : SOPP_Real_32_gfx10<0x020>;
-defm S_CLAUSE : SOPP_Real_32_gfx10<0x021>;
-defm S_WAIT_IDLE : SOPP_Real_32_gfx10<0x022>;
-defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx10<0x023>;
-defm S_ROUND_MODE : SOPP_Real_32_gfx10<0x024>;
-defm S_DENORM_MODE : SOPP_Real_32_gfx10<0x025>;
-defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx10<0x028>;
-
let isBranch = 1 in {
-defm S_BRANCH : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x002>;
-defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x004>;
-defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x005>;
-defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x006>;
-defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x007>;
-defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x008>;
-defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x009>;
defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x017>;
defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x018>;
defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x019>;
@@ -2944,6 +3161,77 @@ defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_
}
//===----------------------------------------------------------------------===//
+// SOPP - GFX10, GFX11, GFX12, GFX13.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPP_Real_32_gfx10_gfx11_gfx12_gfx13<bits<7> op> :
+ SOPP_Real_32_gfx10<op>, SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op>,
+ SOPP_Real_32_gfx13<op>;
+
+defm S_CODE_END : SOPP_Real_32_gfx10_gfx11_gfx12_gfx13<0x01f>;
+
+//===----------------------------------------------------------------------===//
+// SOPP - GFX11, GFX12.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPP_Real_32_gfx11_gfx12<bits<7> op> :
+ SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op>;
+
+multiclass SOPP_Real_32_gfx11_Renamed_gfx12<bits<7> op, string gfx12_name> :
+ SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op, gfx12_name>;
+
+multiclass SOPP_Real_With_Relaxation_gfx12<bits<7> op> {
+ defm "" : SOPP_Real_32_gfx12<op>;
+ let isCodeGenOnly = 1 in
+ defm _pad_s_nop : SOPP_Real_64_gfx12<op>;
+}
+
+multiclass SOPP_Real_With_Relaxation_gfx11_gfx12<bits<7>op> :
+ SOPP_Real_With_Relaxation_gfx11<op>, SOPP_Real_With_Relaxation_gfx12<op>;
+
+defm S_SETKILL : SOPP_Real_32_gfx11_gfx12<0x001>;
+defm S_SETHALT : SOPP_Real_32_gfx11_gfx12<0x002>;
+defm S_SLEEP : SOPP_Real_32_gfx11_gfx12<0x003>;
+defm S_CLAUSE : SOPP_Real_32_gfx11_gfx12<0x005>;
+defm S_DELAY_ALU : SOPP_Real_32_gfx11_gfx12<0x007>;
+defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx11_Renamed_gfx12<0x008, "s_wait_alu">;
+defm S_WAITCNT : SOPP_Real_32_gfx11_gfx12<0x009>;
+defm S_WAIT_IDLE : SOPP_Real_32_gfx11_gfx12<0x00a>;
+defm S_WAIT_EVENT : SOPP_Real_32_gfx11_gfx12<0x00b>;
+defm S_TRAP : SOPP_Real_32_gfx11_gfx12<0x010>;
+defm S_ROUND_MODE : SOPP_Real_32_gfx11_gfx12<0x011>;
+defm S_DENORM_MODE : SOPP_Real_32_gfx11_gfx12<0x012>;
+defm S_BRANCH : SOPP_Real_With_Relaxation_gfx11_gfx12<0x020>;
+defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx11_gfx12<0x021>;
+defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx11_gfx12<0x022>;
+defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x023>;
+defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x024>;
+defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x025>;
+defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x026>;
+defm S_ENDPGM : SOPP_Real_32_gfx11_gfx12<0x030>;
+defm S_ENDPGM_SAVED : SOPP_Real_32_gfx11_gfx12<0x031>;
+defm S_WAKEUP : SOPP_Real_32_gfx11_gfx12<0x034>;
+defm S_SETPRIO : SOPP_Real_32_gfx11_gfx12<0x035>;
+defm S_SENDMSG : SOPP_Real_32_gfx11_gfx12<0x036>;
+defm S_SENDMSGHALT : SOPP_Real_32_gfx11_gfx12<0x037>;
+defm S_INCPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x038>;
+defm S_DECPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x039>;
+defm S_TTRACEDATA : SOPP_Real_32_gfx11_gfx12<0x03a>;
+defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx11_gfx12<0x03b>;
+defm S_ICACHE_INV : SOPP_Real_32_gfx11_gfx12<0x03c>;
+
+//===----------------------------------------------------------------------===//
+// SOPP - GFX8, GFX9, GFX10.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPP_Real_32_gfx8_gfx9_gfx10<bits<7> op> :
+ SOPP_Real_32_gfx8_gfx9<op>, SOPP_Real_32_gfx10<op>;
+
+defm S_WAKEUP : SOPP_Real_32_gfx8_gfx9_gfx10<0x003>;
+defm S_ENDPGM_ORDERED_PS_DONE : SOPP_Real_32_gfx8_gfx9_gfx10<0x01e>;
+
+//===----------------------------------------------------------------------===//
// SOPC - GFX11, GFX12.
//===----------------------------------------------------------------------===//
@@ -2964,41 +3252,61 @@ defm S_CMP_EQ_U64 : SOPC_Real_gfx11_gfx12<0x10>;
defm S_CMP_LG_U64 : SOPC_Real_gfx11_gfx12<0x11>;
//===----------------------------------------------------------------------===//
-// SOPC - GFX1150, GFX12
+// SOPC - GFX1150, GFX12, GFX13.
//===----------------------------------------------------------------------===//
-defm S_CMP_LT_F32 : SOPC_Real_gfx11_gfx12<0x41>;
-defm S_CMP_EQ_F32 : SOPC_Real_gfx11_gfx12<0x42>;
-defm S_CMP_LE_F32 : SOPC_Real_gfx11_gfx12<0x43>;
-defm S_CMP_GT_F32 : SOPC_Real_gfx11_gfx12<0x44>;
-defm S_CMP_LG_F32 : SOPC_Real_gfx11_gfx12<0x45>;
-defm S_CMP_GE_F32 : SOPC_Real_gfx11_gfx12<0x46>;
-defm S_CMP_O_F32 : SOPC_Real_gfx11_gfx12<0x47>;
-defm S_CMP_U_F32 : SOPC_Real_gfx11_gfx12<0x48>;
-defm S_CMP_NGE_F32 : SOPC_Real_gfx11_gfx12<0x49>;
-defm S_CMP_NLG_F32 : SOPC_Real_gfx11_gfx12<0x4a>;
-defm S_CMP_NGT_F32 : SOPC_Real_gfx11_gfx12<0x4b>;
-defm S_CMP_NLE_F32 : SOPC_Real_gfx11_gfx12<0x4c>;
-defm S_CMP_NEQ_F32 : SOPC_Real_gfx11_gfx12<0x4d>;
-defm S_CMP_NLT_F32 : SOPC_Real_gfx11_gfx12<0x4e>;
+multiclass SOPC_Real_gfx13<bits<7> op> {
+ def _gfx13 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
+ Select<GFX13Gen, !cast<SOPC_Pseudo>(NAME).Mnemonic>;
+}
-defm S_CMP_LT_F16 : SOPC_Real_gfx11_gfx12<0x51>;
-defm S_CMP_EQ_F16 : SOPC_Real_gfx11_gfx12<0x52>;
-defm S_CMP_LE_F16 : SOPC_Real_gfx11_gfx12<0x53>;
-defm S_CMP_GT_F16 : SOPC_Real_gfx11_gfx12<0x54>;
-defm S_CMP_LG_F16 : SOPC_Real_gfx11_gfx12<0x55>;
-defm S_CMP_GE_F16 : SOPC_Real_gfx11_gfx12<0x56>;
-defm S_CMP_O_F16 : SOPC_Real_gfx11_gfx12<0x57>;
-defm S_CMP_U_F16 : SOPC_Real_gfx11_gfx12<0x58>;
-defm S_CMP_NGE_F16 : SOPC_Real_gfx11_gfx12<0x59>;
-defm S_CMP_NLG_F16 : SOPC_Real_gfx11_gfx12<0x5a>;
-defm S_CMP_NGT_F16 : SOPC_Real_gfx11_gfx12<0x5b>;
-defm S_CMP_NLE_F16 : SOPC_Real_gfx11_gfx12<0x5c>;
-defm S_CMP_NEQ_F16 : SOPC_Real_gfx11_gfx12<0x5d>;
-defm S_CMP_NLT_F16 : SOPC_Real_gfx11_gfx12<0x5e>;
+multiclass SOPC_Real_gfx11_gfx12_gfx13<bits<7> op> :
+ SOPC_Real_gfx11<op>, SOPC_Real_gfx12<op>, SOPC_Real_gfx13<op>;
+
+defm S_CMP_LT_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x41>;
+defm S_CMP_EQ_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x42>;
+defm S_CMP_LE_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x43>;
+defm S_CMP_GT_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x44>;
+defm S_CMP_LG_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x45>;
+defm S_CMP_GE_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x46>;
+defm S_CMP_O_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x47>;
+defm S_CMP_U_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x48>;
+defm S_CMP_NGE_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x49>;
+defm S_CMP_NLG_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x4a>;
+defm S_CMP_NGT_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x4b>;
+defm S_CMP_NLE_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x4c>;
+defm S_CMP_NEQ_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x4d>;
+defm S_CMP_NLT_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x4e>;
+
+defm S_CMP_LT_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x51>;
+defm S_CMP_EQ_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x52>;
+defm S_CMP_LE_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x53>;
+defm S_CMP_GT_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x54>;
+defm S_CMP_LG_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x55>;
+defm S_CMP_GE_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x56>;
+defm S_CMP_O_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x57>;
+defm S_CMP_U_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x58>;
+defm S_CMP_NGE_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x59>;
+defm S_CMP_NLG_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x5a>;
+defm S_CMP_NGT_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x5b>;
+defm S_CMP_NLE_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x5c>;
+defm S_CMP_NEQ_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x5d>;
+defm S_CMP_NLT_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x5e>;
//===----------------------------------------------------------------------===//
-// SOPC - GFX6, GFX7, GFX8, GFX9, GFX10
+// SOPC - GFX8, GFX9.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPC_Real_gfx8_gfx9<bits<7> op> {
+ defvar ps = !cast<SOPC_Pseudo>(NAME);
+ def _vi : SOPC_Real<op, ps>,
+ Select_vi<ps.PseudoInstr>;
+}
+
+defm S_SET_GPR_IDX_ON : SOPC_Real_gfx8_gfx9<0x11>;
+
+//===----------------------------------------------------------------------===//
+// SOPC - GFX6, GFX7, GFX8, GFX9.
//===----------------------------------------------------------------------===//
multiclass SOPC_Real_gfx6_gfx7<bits<7> op> {
@@ -3007,11 +3315,14 @@ multiclass SOPC_Real_gfx6_gfx7<bits<7> op> {
Select_gfx6_gfx7<ps.PseudoInstr>;
}
-multiclass SOPC_Real_gfx8_gfx9<bits<7> op> {
- defvar ps = !cast<SOPC_Pseudo>(NAME);
- def _vi : SOPC_Real<op, ps>,
- Select_vi<ps.PseudoInstr>;
-}
+multiclass SOPC_Real_gfx6_gfx7_gfx8_gfx9<bits<7> op> :
+ SOPC_Real_gfx6_gfx7<op>, SOPC_Real_gfx8_gfx9<op>;
+
+defm S_SETVSKIP : SOPC_Real_gfx6_gfx7_gfx8_gfx9<0x10>;
+
+//===----------------------------------------------------------------------===//
+// SOPC - GFX8, GFX9, GFX10, GFX13.
+//===----------------------------------------------------------------------===//
multiclass SOPC_Real_gfx10<bits<7> op> {
defvar ps = !cast<SOPC_Pseudo>(NAME);
@@ -3019,36 +3330,36 @@ multiclass SOPC_Real_gfx10<bits<7> op> {
Select<GFX10Gen, ps.PseudoInstr>;
}
-multiclass SOPC_Real_gfx8_gfx9_gfx10<bits<7> op> :
- SOPC_Real_gfx8_gfx9<op>, SOPC_Real_gfx10<op>;
+multiclass SOPC_Real_gfx8_gfx9_gfx10_gfx13<bits<7> op> :
+ SOPC_Real_gfx8_gfx9<op>, SOPC_Real_gfx10<op>, SOPC_Real_gfx13<op>;
-multiclass SOPC_Real_gfx6_gfx7_gfx8_gfx9<bits<7> op> :
- SOPC_Real_gfx6_gfx7<op>, SOPC_Real_gfx8_gfx9<op>;
+defm S_CMP_EQ_U64 : SOPC_Real_gfx8_gfx9_gfx10_gfx13<0x12>;
+defm S_CMP_LG_U64 : SOPC_Real_gfx8_gfx9_gfx10_gfx13<0x13>;
+
+//===----------------------------------------------------------------------===//
+// SOPC - GFX6, GFX7, GFX8, GFX9, GFX10, GFX11, GFX12, GFX13.
+//===----------------------------------------------------------------------===//
-multiclass SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<bits<7> op> :
+multiclass SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<bits<7> op> :
SOPC_Real_gfx6_gfx7_gfx8_gfx9<op>, SOPC_Real_gfx10<op>, SOPC_Real_gfx11<op>,
- SOPC_Real_gfx12<op>;
-
-defm S_CMP_EQ_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x00>;
-defm S_CMP_LG_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x01>;
-defm S_CMP_GT_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x02>;
-defm S_CMP_GE_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x03>;
-defm S_CMP_LT_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x04>;
-defm S_CMP_LE_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x05>;
-defm S_CMP_EQ_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x06>;
-defm S_CMP_LG_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x07>;
-defm S_CMP_GT_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x08>;
-defm S_CMP_GE_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x09>;
-defm S_CMP_LT_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0a>;
-defm S_CMP_LE_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0b>;
-defm S_BITCMP0_B32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0c>;
-defm S_BITCMP1_B32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0d>;
-defm S_BITCMP0_B64 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0e>;
-defm S_BITCMP1_B64 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0f>;
-defm S_SETVSKIP : SOPC_Real_gfx6_gfx7_gfx8_gfx9<0x10>;
-defm S_SET_GPR_IDX_ON : SOPC_Real_gfx8_gfx9<0x11>;
-defm S_CMP_EQ_U64 : SOPC_Real_gfx8_gfx9_gfx10<0x12>;
-defm S_CMP_LG_U64 : SOPC_Real_gfx8_gfx9_gfx10<0x13>;
+ SOPC_Real_gfx12<op>, SOPC_Real_gfx13<op>;
+
+defm S_CMP_EQ_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x00>;
+defm S_CMP_LG_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x01>;
+defm S_CMP_GT_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x02>;
+defm S_CMP_GE_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x03>;
+defm S_CMP_LT_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x04>;
+defm S_CMP_LE_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x05>;
+defm S_CMP_EQ_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x06>;
+defm S_CMP_LG_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x07>;
+defm S_CMP_GT_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x08>;
+defm S_CMP_GE_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x09>;
+defm S_CMP_LT_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0a>;
+defm S_CMP_LE_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0b>;
+defm S_BITCMP0_B32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0c>;
+defm S_BITCMP1_B32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0d>;
+defm S_BITCMP0_B64 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0e>;
+defm S_BITCMP1_B64 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0f>;
//===----------------------------------------------------------------------===//
// GFX8 (VI), GFX9.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 6489e63..fddd9c7 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -99,7 +99,6 @@ static constexpr CustomOperand MsgOperands[] = {
{{"MSG_EARLY_PRIM_DEALLOC"}, ID_EARLY_PRIM_DEALLOC, isGFX9_GFX10},
{{"MSG_GS_ALLOC_REQ"}, ID_GS_ALLOC_REQ, isGFX9Plus},
{{"MSG_GET_DOORBELL"}, ID_GET_DOORBELL, isGFX9_GFX10},
- {{"MSG_SAVEWAVE_HAS_TDM"}, ID_SAVEWAVE_HAS_TDM, isGFX1250},
{{"MSG_GET_DDID"}, ID_GET_DDID, isGFX10},
{{"MSG_SYSMSG"}, ID_SYSMSG},
{{"MSG_RTN_GET_DOORBELL"}, ID_RTN_GET_DOORBELL, isGFX11Plus},
@@ -111,7 +110,8 @@ static constexpr CustomOperand MsgOperands[] = {
{{"MSG_RTN_GET_TBA_TO_PC"}, ID_RTN_GET_TBA_TO_PC, isGFX11Plus},
{{"MSG_RTN_GET_SE_AID_ID"}, ID_RTN_GET_SE_AID_ID, isGFX12Plus},
{{"MSG_RTN_GET_CLUSTER_BARRIER_STATE"}, ID_RTN_GET_CLUSTER_BARRIER_STATE,
- isGFX1250},
+ isGFX1250Plus},
+ {{"MSG_RTN_SAVE_WAVE_HAS_TDM"}, ID_RTN_SAVE_WAVE_HAS_TDM, isGFX1250Plus}
};
static constexpr CustomOperand SysMsgOperands[] = {
@@ -156,6 +156,26 @@ StringRef getMsgOpName(int64_t MsgId, uint64_t Encoding,
} // namespace SendMsg
+namespace WaitEvent {
+
+// clang-format off
+static constexpr CustomOperand WaitEventOperands[] = {
+ {{"{ export_ready: 0 }"}, 0, isGFX12Plus},
+ {{"{ dont_wait_export_ready: 0 }"}, 0, isGFX11},
+ {{"{ dont_wait_export_ready: 1 }"}, DONT_WAIT_EXPORT_READY, isGFX11},
+ {{"{ export_ready: 1 }"}, EXPORT_READY, isGFX12Plus}
+};
+// clang-format on
+
+int64_t getWaitEventMask(StringRef Name, const MCSubtargetInfo &STI) {
+ return getEncodingFromOperandTable(WaitEventOperands, Name, STI);
+}
+
+StringRef getWaitEventMaskName(uint64_t Encoding, const MCSubtargetInfo &STI) {
+ return getNameFromOperandTable(WaitEventOperands, Encoding, STI);
+}
+} // namespace WaitEvent
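A minimal usage sketch for these helpers, assuming STI describes a GFX12+ target (STI and the surrounding assembler context are placeholders, not part of this change; the string keys mirror the operand table above):

  // Round-trip a symbolic wait-event operand through the lookup helpers.
  int64_t Mask =
      AMDGPU::WaitEvent::getWaitEventMask("{ export_ready: 1 }", STI);
  // Mask == EXPORT_READY on GFX12+; the reverse lookup restores the name.
  StringRef Name =
      AMDGPU::WaitEvent::getWaitEventMaskName(AMDGPU::WaitEvent::EXPORT_READY, STI);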
+
namespace Hwreg {
// Disable lint checking for this block since it makes the table unreadable.
@@ -211,8 +231,9 @@ static constexpr CustomOperand Operands[] = {
{{"HW_REG_HW_ID2"}, ID_HW_ID2, isGFX10Plus},
{{"HW_REG_SQ_PERF_SNAPSHOT_PC_HI"}, ID_SQ_PERF_SNAPSHOT_PC_HI, isGFX940},
{{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10},
+ {{"HW_REG_WAVE_SCHED_MODE"}, ID_SCHED_MODE, isGFX12Plus},
{{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11},
- {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250},
+ {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250Plus},
{{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11},
{{"HW_REG_SHADER_CYCLES_LO"}, ID_SHADER_CYCLES, isGFX12Plus},
{{"HW_REG_SHADER_CYCLES_HI"}, ID_SHADER_CYCLES_HI, isGFX12Plus},
@@ -220,8 +241,8 @@ static constexpr CustomOperand Operands[] = {
{{"HW_REG_DVGPR_ALLOC_LO"}, ID_DVGPR_ALLOC_LO, isGFX12Plus},
{{"HW_REG_WAVE_DVGPR_ALLOC_HI"}, ID_DVGPR_ALLOC_HI, isGFX12Plus},
{{"HW_REG_DVGPR_ALLOC_HI"}, ID_DVGPR_ALLOC_HI, isGFX12Plus},
- {{"HW_REG_XNACK_STATE_PRIV"}, ID_XNACK_STATE_PRIV, isGFX1250},
- {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK_gfx1250, isGFX1250},
+ {{"HW_REG_XNACK_STATE_PRIV"}, ID_XNACK_STATE_PRIV, isGFX1250Plus},
+ {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK_gfx1250, isGFX1250Plus},
};
// clang-format on
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
index c84c1a7..5916e27 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -84,6 +84,11 @@ StringRef getMsgOpName(int64_t MsgId, uint64_t Encoding,
} // namespace SendMsg
+namespace WaitEvent {
+int64_t getWaitEventMask(StringRef Name, const MCSubtargetInfo &STI);
+StringRef getWaitEventMaskName(uint64_t Encoding, const MCSubtargetInfo &STI);
+} // namespace WaitEvent
+
namespace Hwreg { // Symbolic names for the hwreg(...) syntax.
int64_t getHwregId(StringRef Name, const MCSubtargetInfo &STI);
@@ -127,6 +132,20 @@ ArrayRef<GFXVersion> getGFXVersions();
} // namespace UCVersion
+namespace WMMAMods {
+// These should match enum values in SIDefines.h
+
+constexpr const char *const ModMatrixFmt[] = {
+ "MATRIX_FMT_FP8", "MATRIX_FMT_BF8", "MATRIX_FMT_FP6", "MATRIX_FMT_BF6",
+ "MATRIX_FMT_FP4"};
+
+constexpr const char *const ModMatrixScale[] = {"MATRIX_SCALE_ROW0",
+ "MATRIX_SCALE_ROW1"};
+
+constexpr const char *const ModMatrixScaleFmt[] = {
+ "MATRIX_SCALE_FMT_E8", "MATRIX_SCALE_FMT_E5M3", "MATRIX_SCALE_FMT_E4M3"};
+} // namespace WMMAMods
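Because the arrays are declared to match the enum order in SIDefines.h, a printer can index them directly with the decoded modifier value; a sketch, where Fmt stands in for a hypothetical decoded field:

  unsigned Fmt = 2; // selects "MATRIX_FMT_FP6" from ModMatrixFmt above
  llvm::StringRef Name = llvm::AMDGPU::WMMAMods::ModMatrixFmt[Fmt];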
+
} // namespace AMDGPU
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 3e1b058..3f32d11 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -177,7 +177,13 @@ inline unsigned getVaSsrcBitWidth() { return 1; }
inline unsigned getVaSsrcBitShift() { return 8; }
/// \returns HoldCnt bit width
-inline unsigned getHoldCntWidth() { return 1; }
+inline unsigned getHoldCntWidth(unsigned VersionMajor, unsigned VersionMinor) {
+ static constexpr const unsigned MinMajor = 10;
+ static constexpr const unsigned MinMinor = 3;
+ return std::tie(VersionMajor, VersionMinor) >= std::tie(MinMajor, MinMinor)
+ ? 1
+ : 0;
+}
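A quick check of the version gating, following directly from the std::tie comparison above:

  // getHoldCntWidth(10, 3) == 1, so getHoldCntBitMask() == (1 << 1) - 1 == 1.
  // getHoldCntWidth(10, 1) == 0, so the mask is (1 << 0) - 1 == 0; the
  // HoldCnt field does not exist before GFX10.3.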
/// \returns HoldCnt bit shift
inline unsigned getHoldCntBitShift() { return 7; }
@@ -188,6 +194,10 @@ namespace llvm {
namespace AMDGPU {
+iota_range<InstCounterType> inst_counter_types(InstCounterType MaxCounter) {
+ return enum_seq(LOAD_CNT, MaxCounter);
+}
+
/// \returns true if the target supports signed immediate offset for SMRD
/// instructions.
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
@@ -349,8 +359,8 @@ unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode,
}
struct MUBUFInfo {
- uint16_t Opcode;
- uint16_t BaseOpcode;
+ uint32_t Opcode;
+ uint32_t BaseOpcode;
uint8_t elements;
bool has_vaddr;
bool has_srsrc;
@@ -360,8 +370,8 @@ struct MUBUFInfo {
};
struct MTBUFInfo {
- uint16_t Opcode;
- uint16_t BaseOpcode;
+ uint32_t Opcode;
+ uint32_t BaseOpcode;
uint8_t elements;
bool has_vaddr;
bool has_srsrc;
@@ -369,25 +379,25 @@ struct MTBUFInfo {
};
struct SMInfo {
- uint16_t Opcode;
+ uint32_t Opcode;
bool IsBuffer;
};
struct VOPInfo {
- uint16_t Opcode;
+ uint32_t Opcode;
bool IsSingle;
};
struct VOPC64DPPInfo {
- uint16_t Opcode;
+ uint32_t Opcode;
};
struct VOPCDPPAsmOnlyInfo {
- uint16_t Opcode;
+ uint32_t Opcode;
};
struct VOP3CDPPAsmOnlyInfo {
- uint16_t Opcode;
+ uint32_t Opcode;
};
struct VOPDComponentInfo {
@@ -398,7 +408,7 @@ struct VOPDComponentInfo {
};
struct VOPDInfo {
- uint16_t Opcode;
+ uint32_t Opcode;
uint16_t OpX;
uint16_t OpY;
uint16_t Subtarget;
@@ -406,7 +416,7 @@ struct VOPDInfo {
};
struct VOPTrue16Info {
- uint16_t Opcode;
+ uint32_t Opcode;
bool IsTrue16;
};
@@ -414,16 +424,18 @@ struct VOPTrue16Info {
#define GET_FP4FP8DstByteSelTable_IMPL
struct DPMACCInstructionInfo {
- uint16_t Opcode;
+ uint32_t Opcode;
bool IsDPMACCInstruction;
};
struct FP4FP8DstByteSelInfo {
- uint16_t Opcode;
+ uint32_t Opcode;
bool HasFP8DstByteSel;
bool HasFP4DstByteSel;
};
+#define GET_DPMACCInstructionTable_DECL
+#define GET_DPMACCInstructionTable_IMPL
#define GET_MTBUFInfoTable_DECL
#define GET_MTBUFInfoTable_IMPL
#define GET_MUBUFInfoTable_DECL
@@ -729,6 +741,8 @@ bool isGenericAtomic(unsigned Opc) {
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32 ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 ||
Opc == AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG;
}
@@ -780,6 +794,11 @@ FPType getFPDstSelType(unsigned Opc) {
return FPType::None;
}
+bool isDPMACCInstruction(unsigned Opc) {
+ const DPMACCInstructionInfo *Info = getDPMACCInstructionHelper(Opc);
+ return Info && Info->IsDPMACCInstruction;
+}
+
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
return Info ? Info->Opcode3Addr : ~0u;
@@ -793,7 +812,7 @@ unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc) {
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
// header files, so we need to wrap it in a function that takes unsigned
// instead.
-int getMCOpcode(uint16_t Opcode, unsigned Gen) {
+int64_t getMCOpcode(uint32_t Opcode, unsigned Gen) {
return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
}
@@ -897,7 +916,7 @@ unsigned ComponentInfo::getIndexInParsedOperands(unsigned CompOprIdx) const {
}
std::optional<unsigned> InstInfo::getInvalidCompOperandIndex(
- std::function<unsigned(unsigned, unsigned)> GetRegIdx,
+ std::function<MCRegister(unsigned, unsigned)> GetRegIdx,
const MCRegisterInfo &MRI, bool SkipSrc, bool AllowSameVGPR,
bool VOPD3) const {
@@ -914,12 +933,13 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex(
BaseX = X;
if (!BaseY)
BaseY = Y;
- if ((BaseX & BanksMask) == (BaseY & BanksMask))
+ if ((BaseX.id() & BanksMask) == (BaseY.id() & BanksMask))
return true;
if (BaseX != X /* This is 64-bit register */ &&
- ((BaseX + 1) & BanksMask) == (BaseY & BanksMask))
+ ((BaseX.id() + 1) & BanksMask) == (BaseY.id() & BanksMask))
return true;
- if (BaseY != Y && (BaseX & BanksMask) == ((BaseY + 1) & BanksMask))
+ if (BaseY != Y &&
+ (BaseX.id() & BanksMask) == ((BaseY.id() + 1) & BanksMask))
return true;
// If both are 64-bit bank conflict will be detected yet while checking
@@ -968,7 +988,7 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex(
// if the operand is not a register or not a VGPR.
InstInfo::RegIndices
InstInfo::getRegIndices(unsigned CompIdx,
- std::function<unsigned(unsigned, unsigned)> GetRegIdx,
+ std::function<MCRegister(unsigned, unsigned)> GetRegIdx,
bool VOPD3) const {
assert(CompIdx < COMPONENTS_NUM);
@@ -983,7 +1003,7 @@ InstInfo::getRegIndices(unsigned CompIdx,
Comp.hasRegSrcOperand(CompSrcIdx)
? GetRegIdx(CompIdx,
Comp.getIndexOfSrcInMCOperands(CompSrcIdx, VOPD3))
- : 0;
+ : MCRegister();
}
return RegIndices;
}
@@ -1709,6 +1729,30 @@ bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val) {
return false;
}
+raw_ostream &operator<<(raw_ostream &OS, const AMDGPU::Waitcnt &Wait) {
+ ListSeparator LS;
+ if (Wait.LoadCnt != ~0u)
+ OS << LS << "LoadCnt: " << Wait.LoadCnt;
+ if (Wait.ExpCnt != ~0u)
+ OS << LS << "ExpCnt: " << Wait.ExpCnt;
+ if (Wait.DsCnt != ~0u)
+ OS << LS << "DsCnt: " << Wait.DsCnt;
+ if (Wait.StoreCnt != ~0u)
+ OS << LS << "StoreCnt: " << Wait.StoreCnt;
+ if (Wait.SampleCnt != ~0u)
+ OS << LS << "SampleCnt: " << Wait.SampleCnt;
+ if (Wait.BvhCnt != ~0u)
+ OS << LS << "BvhCnt: " << Wait.BvhCnt;
+ if (Wait.KmCnt != ~0u)
+ OS << LS << "KmCnt: " << Wait.KmCnt;
+ if (Wait.XCnt != ~0u)
+ OS << LS << "XCnt: " << Wait.XCnt;
+ if (LS.unused())
+ OS << "none";
+ OS << '\n';
+ return OS;
+}
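A sketch of the resulting output format (Waitcnt fields are public, so direct assignment is valid):

  AMDGPU::Waitcnt W;
  W.LoadCnt = 0;
  W.DsCnt = 2;
  llvm::dbgs() << W; // prints "LoadCnt: 0, DsCnt: 2\n"
  // A fully default Waitcnt (all fields ~0u) prints "none\n".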
+
unsigned getVmcntBitMask(const IsaVersion &Version) {
return (1 << (getVmcntBitWidthLo(Version.Major) +
getVmcntBitWidthHi(Version.Major))) -
@@ -1751,6 +1795,25 @@ unsigned getStorecntBitMask(const IsaVersion &Version) {
return (1 << getStorecntBitWidth(Version.Major)) - 1;
}
+HardwareLimits::HardwareLimits(const IsaVersion &IV) {
+ bool HasExtendedWaitCounts = IV.Major >= 12;
+ if (HasExtendedWaitCounts) {
+ LoadcntMax = getLoadcntBitMask(IV);
+ DscntMax = getDscntBitMask(IV);
+ } else {
+ LoadcntMax = getVmcntBitMask(IV);
+ DscntMax = getLgkmcntBitMask(IV);
+ }
+ ExpcntMax = getExpcntBitMask(IV);
+ StorecntMax = getStorecntBitMask(IV);
+ SamplecntMax = getSamplecntBitMask(IV);
+ BvhcntMax = getBvhcntBitMask(IV);
+ KmcntMax = getKmcntBitMask(IV);
+ XcntMax = getXcntBitMask(IV);
+ VaVdstMax = DepCtr::getVaVdstBitMask();
+ VmVsrcMax = DepCtr::getVmVsrcBitMask();
+}
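A construction sketch, where ST is assumed to be the MCSubtargetInfo of the target being compiled:

  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  AMDGPU::HardwareLimits Limits(IV);
  // IV.Major >= 12 selects the Loadcnt/Dscnt masks; older targets fall back
  // to the Vmcnt/Lgkmcnt masks, per the constructor above.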
+
unsigned getWaitcntBitMask(const IsaVersion &Version) {
unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major),
getVmcntBitWidthLo(Version.Major));
@@ -2019,6 +2082,22 @@ int encodeDepCtr(const StringRef Name, int64_t Val, unsigned &UsedOprMask,
STI);
}
+unsigned getVaVdstBitMask() { return (1 << getVaVdstBitWidth()) - 1; }
+
+unsigned getVaSdstBitMask() { return (1 << getVaSdstBitWidth()) - 1; }
+
+unsigned getVaSsrcBitMask() { return (1 << getVaSsrcBitWidth()) - 1; }
+
+unsigned getHoldCntBitMask(const IsaVersion &Version) {
+ return (1 << getHoldCntWidth(Version.Major, Version.Minor)) - 1;
+}
+
+unsigned getVmVsrcBitMask() { return (1 << getVmVsrcBitWidth()) - 1; }
+
+unsigned getVaVccBitMask() { return (1 << getVaVccBitWidth()) - 1; }
+
+unsigned getSaSdstBitMask() { return (1 << getSaSdstBitWidth()) - 1; }
+
unsigned decodeFieldVmVsrc(unsigned Encoded) {
return unpackBits(Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth());
}
@@ -2043,64 +2122,74 @@ unsigned decodeFieldVaSsrc(unsigned Encoded) {
return unpackBits(Encoded, getVaSsrcBitShift(), getVaSsrcBitWidth());
}
-unsigned decodeFieldHoldCnt(unsigned Encoded) {
- return unpackBits(Encoded, getHoldCntBitShift(), getHoldCntWidth());
+unsigned decodeFieldHoldCnt(unsigned Encoded, const IsaVersion &Version) {
+ return unpackBits(Encoded, getHoldCntBitShift(),
+ getHoldCntWidth(Version.Major, Version.Minor));
}
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc) {
return packBits(VmVsrc, Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth());
}
-unsigned encodeFieldVmVsrc(unsigned VmVsrc) {
- return encodeFieldVmVsrc(0xffff, VmVsrc);
+unsigned encodeFieldVmVsrc(unsigned VmVsrc, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldVmVsrc(Encoded, VmVsrc);
}
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst) {
return packBits(VaVdst, Encoded, getVaVdstBitShift(), getVaVdstBitWidth());
}
-unsigned encodeFieldVaVdst(unsigned VaVdst) {
- return encodeFieldVaVdst(0xffff, VaVdst);
+unsigned encodeFieldVaVdst(unsigned VaVdst, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldVaVdst(Encoded, VaVdst);
}
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst) {
return packBits(SaSdst, Encoded, getSaSdstBitShift(), getSaSdstBitWidth());
}
-unsigned encodeFieldSaSdst(unsigned SaSdst) {
- return encodeFieldSaSdst(0xffff, SaSdst);
+unsigned encodeFieldSaSdst(unsigned SaSdst, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldSaSdst(Encoded, SaSdst);
}
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst) {
return packBits(VaSdst, Encoded, getVaSdstBitShift(), getVaSdstBitWidth());
}
-unsigned encodeFieldVaSdst(unsigned VaSdst) {
- return encodeFieldVaSdst(0xffff, VaSdst);
+unsigned encodeFieldVaSdst(unsigned VaSdst, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldVaSdst(Encoded, VaSdst);
}
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc) {
return packBits(VaVcc, Encoded, getVaVccBitShift(), getVaVccBitWidth());
}
-unsigned encodeFieldVaVcc(unsigned VaVcc) {
- return encodeFieldVaVcc(0xffff, VaVcc);
+unsigned encodeFieldVaVcc(unsigned VaVcc, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldVaVcc(Encoded, VaVcc);
}
unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc) {
return packBits(VaSsrc, Encoded, getVaSsrcBitShift(), getVaSsrcBitWidth());
}
-unsigned encodeFieldVaSsrc(unsigned VaSsrc) {
- return encodeFieldVaSsrc(0xffff, VaSsrc);
+unsigned encodeFieldVaSsrc(unsigned VaSsrc, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldVaSsrc(Encoded, VaSsrc);
}
-unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt) {
- return packBits(HoldCnt, Encoded, getHoldCntBitShift(), getHoldCntWidth());
+unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt,
+ const IsaVersion &Version) {
+ return packBits(HoldCnt, Encoded, getHoldCntBitShift(),
+ getHoldCntWidth(Version.Major, Version.Minor));
}
-unsigned encodeFieldHoldCnt(unsigned HoldCnt) {
- return encodeFieldHoldCnt(0xffff, HoldCnt);
+unsigned encodeFieldHoldCnt(unsigned HoldCnt, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldHoldCnt(Encoded, HoldCnt, getIsaVersion(STI.getCPU()));
}
} // namespace DepCtr
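The practical effect of threading the subtarget through these encoders: the one-argument forms now seed from getDefaultDepCtrEncoding(STI) rather than the literal 0xffff, so fields absent on a given target keep their subtarget-specific default bits. A sketch, with STI as a placeholder:

  unsigned Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(/*VmVsrc=*/0, STI);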
@@ -2450,7 +2539,7 @@ unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler) {
}
unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) {
- if (isGFX1250(STI))
+ if (isGFX1250Plus(STI))
return 32;
return 16;
}
@@ -2517,14 +2606,26 @@ bool isGFX12(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX12];
}
-bool isGFX12Plus(const MCSubtargetInfo &STI) { return isGFX12(STI); }
+bool isGFX12Plus(const MCSubtargetInfo &STI) {
+ return isGFX12(STI) || isGFX13Plus(STI);
+}
bool isNotGFX12Plus(const MCSubtargetInfo &STI) { return !isGFX12Plus(STI); }
bool isGFX1250(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts] && !isGFX13(STI);
+}
+
+bool isGFX1250Plus(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts];
}
+bool isGFX13(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX13];
+}
+
+bool isGFX13Plus(const MCSubtargetInfo &STI) { return isGFX13(STI); }
+
bool supportsWGP(const MCSubtargetInfo &STI) {
if (isGFX1250(STI))
return false;
@@ -2578,7 +2679,7 @@ bool hasMAIInsts(const MCSubtargetInfo &STI) {
}
bool hasVOPD(const MCSubtargetInfo &STI) {
- return STI.hasFeature(AMDGPU::FeatureVOPD);
+ return STI.hasFeature(AMDGPU::FeatureVOPDInsts);
}
bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI) {
@@ -2697,8 +2798,8 @@ MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI) {
MCRegister mc2PseudoReg(MCRegister Reg) { MAP_REG2REG }
-bool isInlineValue(unsigned Reg) {
- switch (Reg) {
+bool isInlineValue(MCRegister Reg) {
+ switch (Reg.id()) {
case AMDGPU::SRC_SHARED_BASE_LO:
case AMDGPU::SRC_SHARED_BASE:
case AMDGPU::SRC_SHARED_LIMIT_LO:
@@ -2743,6 +2844,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
@@ -3104,6 +3206,34 @@ std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal) {
return getInlineEncodingV216(true, Literal);
}
+// Returns the encoding of the literal as an inline constant for the
+// V_PK_FMAC_F16 instruction, or std::nullopt if it cannot be inlined. This
+// accounts for different inline constant behavior:
+// - Pre-GFX11: fp16 inline constants have the value in the low 16 bits and
+//   0 in the high bits.
+// - GFX11+: fp16 inline constants are duplicated into both halves.
+std::optional<unsigned> getPKFMACF16InlineEncoding(uint32_t Literal,
+ bool IsGFX11Plus) {
+ // Pre-GFX11 behavior: f16 in low bits, 0 in high bits
+ if (!IsGFX11Plus)
+ return getInlineEncodingV216(/*IsFloat=*/true, Literal);
+
+ // GFX11+ behavior: f16 duplicated in both halves
+ // First, check for sign-extended integer inline constants (-16 to 64);
+ // these encode the same way across all generations.
+ int32_t Signed = static_cast<int32_t>(Literal);
+ if (Signed >= 0 && Signed <= 64)
+ return 128 + Signed;
+
+ if (Signed >= -16 && Signed <= -1)
+ return 192 + std::abs(Signed);
+
+ // For float inline constants on GFX11+, both halves must be equal
+ uint16_t Lo = static_cast<uint16_t>(Literal);
+ uint16_t Hi = static_cast<uint16_t>(Literal >> 16);
+ if (Lo != Hi)
+ return std::nullopt;
+ return getInlineEncodingV216(/*IsFloat=*/true, Lo);
+}
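Illustrative values, following the logic above:

  // Both halves hold 1.0h (Lo == Hi == 0x3C00), so the f16 inline-constant
  // encoding of 0x3C00 is returned.
  getPKFMACF16InlineEncoding(0x3C003C00, /*IsGFX11Plus=*/true);
  // Pre-GFX11: value in the low bits with zero high bits is inlinable.
  getPKFMACF16InlineEncoding(0x00003C00, /*IsGFX11Plus=*/false);
  // On GFX11+ the same literal has differing halves and 0x3C00 is outside
  // [-16, 64], so std::nullopt.
  getPKFMACF16InlineEncoding(0x00003C00, /*IsGFX11Plus=*/true);
  // Small integers encode identically everywhere: 5 -> 128 + 5 == 133.
  getPKFMACF16InlineEncoding(5, /*IsGFX11Plus=*/true);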
+
// Whether the given literal can be inlined for a V_PK_* instruction.
bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) {
switch (OpType) {
@@ -3113,6 +3243,8 @@ bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) {
case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
return getInlineEncodingV216(true, Literal).has_value();
+ case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
+ llvm_unreachable("OPERAND_REG_IMM_V2FP16_SPLAT is not supported");
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
return isInlinableLiteralV2BF16(Literal);
@@ -3138,6 +3270,11 @@ bool isInlinableLiteralV2F16(uint32_t Literal) {
return getInlineEncodingV2F16(Literal).has_value();
}
+// Whether the given literal can be inlined for the V_PK_FMAC_F16
+// instruction.
+bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus) {
+ return getPKFMACF16InlineEncoding(Literal, IsGFX11Plus).has_value();
+}
+
bool isValid32BitLiteral(uint64_t Val, bool IsFP64) {
if (IsFP64)
return !Lo_32(Val);
@@ -3159,6 +3296,7 @@ int64_t encode32BitLiteral(int64_t Imm, OperandType Type, bool IsLit) {
case OPERAND_REG_IMM_INT32:
case OPERAND_REG_IMM_V2BF16:
case OPERAND_REG_IMM_V2FP16:
+ case OPERAND_REG_IMM_V2FP16_SPLAT:
case OPERAND_REG_IMM_V2FP32:
case OPERAND_REG_IMM_V2INT16:
case OPERAND_REG_IMM_V2INT32:
@@ -3361,7 +3499,7 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
: getGfx9BufferFormatInfo(Format);
}
-const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg,
+const MCRegisterClass *getVGPRPhysRegClass(MCRegister Reg,
const MCRegisterInfo &MRI) {
const unsigned VGPRClasses[] = {
AMDGPU::VGPR_16RegClassID, AMDGPU::VGPR_32RegClassID,
@@ -3382,22 +3520,22 @@ const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg,
return nullptr;
}
-unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI) {
+unsigned getVGPREncodingMSBs(MCRegister Reg, const MCRegisterInfo &MRI) {
unsigned Enc = MRI.getEncodingValue(Reg);
unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
return Idx >> 8;
}
-MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs,
- const MCRegisterInfo &MRI) {
+MCRegister getVGPRWithMSBs(MCRegister Reg, unsigned MSBs,
+ const MCRegisterInfo &MRI) {
unsigned Enc = MRI.getEncodingValue(Reg);
unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
if (Idx >= 0x100)
- return AMDGPU::NoRegister;
+ return MCRegister();
const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI);
if (!RC)
- return AMDGPU::NoRegister;
+ return MCRegister();
Idx |= MSBs << 8;
if (RC->getID() == AMDGPU::VGPR_16RegClassID) {
@@ -3438,17 +3576,42 @@ getVGPRLoweringOperandTables(const MCInstrDesc &Desc) {
AMDGPU::OpName::src0Y, AMDGPU::OpName::vsrc1Y, AMDGPU::OpName::vsrc2Y,
AMDGPU::OpName::vdstY};
+ // VOP2 MADMK instructions use the src0, imm, src1 operand scheme.
+ static const AMDGPU::OpName VOP2MADMKOps[4] = {
+ AMDGPU::OpName::src0, AMDGPU::OpName::NUM_OPERAND_NAMES,
+ AMDGPU::OpName::src1, AMDGPU::OpName::vdst};
+ static const AMDGPU::OpName VOPDFMAMKOpsX[4] = {
+ AMDGPU::OpName::src0X, AMDGPU::OpName::NUM_OPERAND_NAMES,
+ AMDGPU::OpName::vsrc1X, AMDGPU::OpName::vdstX};
+ static const AMDGPU::OpName VOPDFMAMKOpsY[4] = {
+ AMDGPU::OpName::src0Y, AMDGPU::OpName::NUM_OPERAND_NAMES,
+ AMDGPU::OpName::vsrc1Y, AMDGPU::OpName::vdstY};
+
unsigned TSFlags = Desc.TSFlags;
if (TSFlags &
(SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | SIInstrFlags::VOP3 |
SIInstrFlags::VOP3P | SIInstrFlags::VOPC | SIInstrFlags::DPP)) {
+ switch (Desc.getOpcode()) {
// LD_SCALE operands ignore MSB.
- if (Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32 ||
- Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32_gfx1250 ||
- Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64 ||
- Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64_gfx1250)
+ case AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32:
+ case AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32_gfx1250:
+ case AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64:
+ case AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64_gfx1250:
return {};
+ case AMDGPU::V_FMAMK_F16:
+ case AMDGPU::V_FMAMK_F16_t16:
+ case AMDGPU::V_FMAMK_F16_t16_gfx12:
+ case AMDGPU::V_FMAMK_F16_fake16:
+ case AMDGPU::V_FMAMK_F16_fake16_gfx12:
+ case AMDGPU::V_FMAMK_F32:
+ case AMDGPU::V_FMAMK_F32_gfx12:
+ case AMDGPU::V_FMAMK_F64:
+ case AMDGPU::V_FMAMK_F64_gfx1250:
+ return {VOP2MADMKOps, nullptr};
+ default:
+ break;
+ }
return {VOPOps, nullptr};
}
@@ -3464,8 +3627,11 @@ getVGPRLoweringOperandTables(const MCInstrDesc &Desc) {
if (TSFlags & SIInstrFlags::VIMAGE)
return {VIMGOps, nullptr};
- if (AMDGPU::isVOPD(Desc.getOpcode()))
- return {VOPDOpsX, VOPDOpsY};
+ if (AMDGPU::isVOPD(Desc.getOpcode())) {
+ auto [OpX, OpY] = getVOPDComponents(Desc.getOpcode());
+ return {(OpX == AMDGPU::V_FMAMK_F32) ? VOPDFMAMKOpsX : VOPDOpsX,
+ (OpY == AMDGPU::V_FMAMK_F32) ? VOPDFMAMKOpsY : VOPDOpsY};
+ }
assert(!(TSFlags & SIInstrFlags::MIMG));
@@ -3545,8 +3711,15 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII,
}
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
- return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256
- : 128;
+ if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize32768))
+ return 64;
+ if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize65536))
+ return 128;
+ if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
+ return 320;
+ if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize327680))
+ return 512;
+ return 64; // In sync with getAddressableLocalMemorySize
}
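The resulting granularities in dwords, with byte equivalents, per addressable-LDS-size feature:

  // 32 KiB LDS  -> 64 dwords (256 B)      64 KiB LDS  -> 128 dwords (512 B)
  // 160 KiB LDS -> 320 dwords (1280 B)    320 KiB LDS -> 512 dwords (2048 B)
  // Any other configuration falls back to 64 dwords.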
bool isPackedFP32Inst(unsigned Opc) {
@@ -3599,9 +3772,9 @@ ClusterDimsAttr ClusterDimsAttr::get(const Function &F) {
if (!Attr.has_value())
AttrKind = Kind::Unknown;
- else if (all_of(*Attr, [](unsigned V) { return V == EncoNoCluster; }))
+ else if (all_of(*Attr, equal_to(EncoNoCluster)))
AttrKind = Kind::NoCluster;
- else if (all_of(*Attr, [](unsigned V) { return V == EncoVariableDims; }))
+ else if (all_of(*Attr, equal_to(EncoVariableDims)))
AttrKind = Kind::VariableDims;
ClusterDimsAttr A(AttrKind);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 5e3195b..7500c24 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -98,7 +98,7 @@ struct GcnBufferFormatInfo {
};
struct MAIInstInfo {
- uint16_t Opcode;
+ uint32_t Opcode;
bool is_dgemm;
bool is_gfx940_xdl;
};
@@ -121,7 +121,7 @@ struct True16D16Info {
};
struct WMMAInstInfo {
- uint16_t Opcode;
+ uint32_t Opcode;
bool is_wmma_xdl;
};
@@ -416,7 +416,7 @@ inline bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx) {
}
LLVM_READONLY
-int getSOPPWithRelaxation(uint16_t Opcode);
+int64_t getSOPPWithRelaxation(uint32_t Opcode);
struct MIMGBaseOpcodeInfo {
MIMGBaseOpcode BaseOpcode;
@@ -522,8 +522,8 @@ unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode,
bool IsG16Supported);
struct MIMGInfo {
- uint16_t Opcode;
- uint16_t BaseOpcode;
+ uint32_t Opcode;
+ uint32_t BaseOpcode;
uint8_t MIMGEncoding;
uint8_t VDataDwords;
uint8_t VAddrDwords;
@@ -646,7 +646,7 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
const MCSubtargetInfo &STI);
LLVM_READONLY
-int getMCOpcode(uint16_t Opcode, unsigned Gen);
+int64_t getMCOpcode(uint32_t Opcode, unsigned Gen);
LLVM_READONLY
unsigned getVOPDOpcode(unsigned Opc, bool VOPD3);
@@ -909,7 +909,7 @@ private:
const ComponentInfo CompInfo[COMPONENTS_NUM];
public:
- using RegIndices = std::array<unsigned, Component::MAX_OPR_NUM>;
+ using RegIndices = std::array<MCRegister, Component::MAX_OPR_NUM>;
InstInfo(const MCInstrDesc &OpX, const MCInstrDesc &OpY)
: CompInfo{OpX, OpY} {}
@@ -932,9 +932,10 @@ public:
// even though it violates requirement to be from different banks.
// If \p VOPD3 is set to true both dst registers allowed to be either odd
// or even and instruction may have real src2 as opposed to tied accumulator.
- bool hasInvalidOperand(std::function<unsigned(unsigned, unsigned)> GetRegIdx,
- const MCRegisterInfo &MRI, bool SkipSrc = false,
- bool AllowSameVGPR = false, bool VOPD3 = false) const {
+ bool
+ hasInvalidOperand(std::function<MCRegister(unsigned, unsigned)> GetRegIdx,
+ const MCRegisterInfo &MRI, bool SkipSrc = false,
+ bool AllowSameVGPR = false, bool VOPD3 = false) const {
return getInvalidCompOperandIndex(GetRegIdx, MRI, SkipSrc, AllowSameVGPR,
VOPD3)
.has_value();
@@ -949,14 +950,14 @@ public:
// If \p VOPD3 is set to true both dst registers allowed to be either odd
// or even and instruction may have real src2 as opposed to tied accumulator.
std::optional<unsigned> getInvalidCompOperandIndex(
- std::function<unsigned(unsigned, unsigned)> GetRegIdx,
+ std::function<MCRegister(unsigned, unsigned)> GetRegIdx,
const MCRegisterInfo &MRI, bool SkipSrc = false,
bool AllowSameVGPR = false, bool VOPD3 = false) const;
private:
RegIndices
getRegIndices(unsigned ComponentIdx,
- std::function<unsigned(unsigned, unsigned)> GetRegIdx,
+ std::function<MCRegister(unsigned, unsigned)> GetRegIdx,
bool VOPD3) const;
};
@@ -1075,6 +1076,37 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size);
/// Checks if \p Val is inside \p MD, a !range-like metadata.
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val);
+enum InstCounterType {
+ LOAD_CNT = 0, // VMcnt prior to gfx12.
+ DS_CNT, // LGKMcnt prior to gfx12.
+ EXP_CNT, //
+ STORE_CNT, // VScnt in gfx10/gfx11.
+ NUM_NORMAL_INST_CNTS,
+ SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
+ BVH_CNT, // gfx12+ only.
+ KM_CNT, // gfx12+ only.
+ X_CNT, // gfx1250.
+ NUM_EXTENDED_INST_CNTS,
+ VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only.
+ VM_VSRC, // gfx12+ expert mode only.
+ NUM_EXPERT_INST_CNTS,
+ NUM_INST_CNTS = NUM_EXPERT_INST_CNTS
+};
+
+// Return an iterator range over all counters from LOAD_CNT (the first
+// counter) up to, but not including, \c MaxCounter; the default value
+// yields an enumeration over all counters.
+iota_range<InstCounterType>
+inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS);
+
+} // namespace AMDGPU
+
+template <> struct enum_iteration_traits<AMDGPU::InstCounterType> {
+ static constexpr bool is_iterable = true;
+};
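With the enum_iteration_traits specialization in place, counters can be enumerated generically; a sketch:

  // Visit only the pre-gfx12 "normal" counters, in declaration order:
  // LOAD_CNT, DS_CNT, EXP_CNT, STORE_CNT.
  for (auto T : AMDGPU::inst_counter_types(AMDGPU::NUM_NORMAL_INST_CNTS)) {
    // ... per-counter bookkeeping ...
  }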
+
+namespace AMDGPU {
+
/// Represents the counter values to wait for in an s_waitcnt instruction.
///
/// Large values (including the maximum possible integer) can be used to
@@ -1088,6 +1120,71 @@ struct Waitcnt {
unsigned BvhCnt = ~0u; // gfx12+ only.
unsigned KmCnt = ~0u; // gfx12+ only.
unsigned XCnt = ~0u; // gfx1250.
+ unsigned VaVdst = ~0u; // gfx12+ expert scheduling mode only.
+ unsigned VmVsrc = ~0u; // gfx12+ expert scheduling mode only.
+
+ unsigned get(InstCounterType T) const {
+ switch (T) {
+ case LOAD_CNT:
+ return LoadCnt;
+ case EXP_CNT:
+ return ExpCnt;
+ case DS_CNT:
+ return DsCnt;
+ case STORE_CNT:
+ return StoreCnt;
+ case SAMPLE_CNT:
+ return SampleCnt;
+ case BVH_CNT:
+ return BvhCnt;
+ case KM_CNT:
+ return KmCnt;
+ case X_CNT:
+ return XCnt;
+ case VA_VDST:
+ return VaVdst;
+ case VM_VSRC:
+ return VmVsrc;
+ default:
+ llvm_unreachable("bad InstCounterType");
+ }
+ }
+ void set(InstCounterType T, unsigned Val) {
+ switch (T) {
+ case LOAD_CNT:
+ LoadCnt = Val;
+ break;
+ case EXP_CNT:
+ ExpCnt = Val;
+ break;
+ case DS_CNT:
+ DsCnt = Val;
+ break;
+ case STORE_CNT:
+ StoreCnt = Val;
+ break;
+ case SAMPLE_CNT:
+ SampleCnt = Val;
+ break;
+ case BVH_CNT:
+ BvhCnt = Val;
+ break;
+ case KM_CNT:
+ KmCnt = Val;
+ break;
+ case X_CNT:
+ XCnt = Val;
+ break;
+ case VA_VDST:
+ VaVdst = Val;
+ break;
+ case VM_VSRC:
+ VmVsrc = Val;
+ break;
+ default:
+ llvm_unreachable("bad InstCounterType");
+ }
+ }
Waitcnt() = default;
// Pre-gfx12 constructor.
@@ -1096,19 +1193,24 @@ struct Waitcnt {
// gfx12+ constructor.
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
- unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt)
+ unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt,
+ unsigned VaVdst, unsigned VmVsrc)
: LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt),
- SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt), XCnt(XCnt) {}
+ SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt), XCnt(XCnt),
+ VaVdst(VaVdst), VmVsrc(VmVsrc) {}
bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); }
bool hasWaitExceptStoreCnt() const {
return LoadCnt != ~0u || ExpCnt != ~0u || DsCnt != ~0u ||
- SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u || XCnt != ~0u;
+ SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u || XCnt != ~0u ||
+ VaVdst != ~0u || VmVsrc != ~0u;
}
bool hasWaitStoreCnt() const { return StoreCnt != ~0u; }
+ bool hasWaitDepctr() const { return VaVdst != ~0u || VmVsrc != ~0u; }
+
Waitcnt combined(const Waitcnt &Other) const {
// Does the right thing provided self and Other are either both pre-gfx12
// or both gfx12+.
@@ -1116,8 +1218,30 @@ struct Waitcnt {
std::min(LoadCnt, Other.LoadCnt), std::min(ExpCnt, Other.ExpCnt),
std::min(DsCnt, Other.DsCnt), std::min(StoreCnt, Other.StoreCnt),
std::min(SampleCnt, Other.SampleCnt), std::min(BvhCnt, Other.BvhCnt),
- std::min(KmCnt, Other.KmCnt), std::min(XCnt, Other.XCnt));
+ std::min(KmCnt, Other.KmCnt), std::min(XCnt, Other.XCnt),
+ std::min(VaVdst, Other.VaVdst), std::min(VmVsrc, Other.VmVsrc));
}
+
+ friend raw_ostream &operator<<(raw_ostream &OS, const AMDGPU::Waitcnt &Wait);
+};
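combined() takes the element-wise minimum, so merging two requirements keeps the strictest wait for each counter; a sketch:

  AMDGPU::Waitcnt A, B;              // all counters default to ~0u (no wait)
  A.set(AMDGPU::LOAD_CNT, 0);
  B.set(AMDGPU::LOAD_CNT, 3);
  B.set(AMDGPU::DS_CNT, 1);
  AMDGPU::Waitcnt C = A.combined(B); // LoadCnt == 0, DsCnt == 1, rest unset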
+
+/// Represents the hardware counter limits for different wait count types.
+struct HardwareLimits {
+ unsigned LoadcntMax; // Corresponds to Vmcnt prior to gfx12.
+ unsigned ExpcntMax;
+ unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
+ unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
+ unsigned SamplecntMax; // gfx12+ only.
+ unsigned BvhcntMax; // gfx12+ only.
+ unsigned KmcntMax; // gfx12+ only.
+ unsigned XcntMax; // gfx1250.
+ unsigned VaVdstMax; // gfx12+ expert mode only.
+ unsigned VmVsrcMax; // gfx12+ expert mode only.
+
+ HardwareLimits() = default;
+
+ /// Initializes hardware limits from the given ISA version.
+ HardwareLimits(const IsaVersion &IV);
};
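
For orientation (a hedged sketch, not from the patch): a requested wait is
encodable only if each counter either carries no wait (~0u) or fits within the
corresponding hardware maximum. Assuming the constructor above fills every
*Max field from the ISA version, a check could look like:

// Sketch: spot-check a few counters against the per-target limits.
bool fitsLimits(const llvm::AMDGPU::Waitcnt &W,
                const llvm::AMDGPU::HardwareLimits &L) {
  return (W.LoadCnt == ~0u || W.LoadCnt <= L.LoadcntMax) &&
         (W.VaVdst == ~0u || W.VaVdst <= L.VaVdstMax) &&
         (W.VmVsrc == ~0u || W.VmVsrc <= L.VmVsrcMax); // remaining counters analogous
}
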
// The following methods are only meaningful on targets that support
@@ -1278,6 +1402,27 @@ bool isSymbolicDepCtrEncoding(unsigned Code, bool &HasNonDefaultVal,
bool decodeDepCtr(unsigned Code, int &Id, StringRef &Name, unsigned &Val,
bool &IsDefault, const MCSubtargetInfo &STI);
+/// \returns Maximum VaVdst value that can be encoded.
+unsigned getVaVdstBitMask();
+
+/// \returns Maximum VaSdst value that can be encoded.
+unsigned getVaSdstBitMask();
+
+/// \returns Maximum VaSsrc value that can be encoded.
+unsigned getVaSsrcBitMask();
+
+/// \returns Maximum HoldCnt value that can be encoded.
+unsigned getHoldCntBitMask(const IsaVersion &Version);
+
+/// \returns Maximum VmVsrc value that can be encoded.
+unsigned getVmVsrcBitMask();
+
+/// \returns Maximum VaVcc value that can be encoded.
+unsigned getVaVccBitMask();
+
+/// \returns Maximum SaSdst value that can be encoded.
+unsigned getSaSdstBitMask();
+
/// \returns Decoded VaVdst from given immediate \p Encoded.
unsigned decodeFieldVaVdst(unsigned Encoded);
@@ -1297,46 +1442,47 @@ unsigned decodeFieldVaVcc(unsigned Encoded);
unsigned decodeFieldVaSsrc(unsigned Encoded);
/// \returns Decoded HoldCnt from given immediate \p Encoded.
-unsigned decodeFieldHoldCnt(unsigned Encoded);
+unsigned decodeFieldHoldCnt(unsigned Encoded, const IsaVersion &Version);
/// \returns \p VmVsrc as an encoded Depctr immediate.
-unsigned encodeFieldVmVsrc(unsigned VmVsrc);
+unsigned encodeFieldVmVsrc(unsigned VmVsrc, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p VmVsrc.
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc);
/// \returns \p VaVdst as an encoded Depctr immediate.
-unsigned encodeFieldVaVdst(unsigned VaVdst);
+unsigned encodeFieldVaVdst(unsigned VaVdst, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p VaVdst.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst);
/// \returns \p SaSdst as an encoded Depctr immediate.
-unsigned encodeFieldSaSdst(unsigned SaSdst);
+unsigned encodeFieldSaSdst(unsigned SaSdst, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p SaSdst.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst);
/// \returns \p VaSdst as an encoded Depctr immediate.
-unsigned encodeFieldVaSdst(unsigned VaSdst);
+unsigned encodeFieldVaSdst(unsigned VaSdst, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p VaSdst.
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst);
/// \returns \p VaVcc as an encoded Depctr immediate.
-unsigned encodeFieldVaVcc(unsigned VaVcc);
+unsigned encodeFieldVaVcc(unsigned VaVcc, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p VaVcc.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc);
/// \returns \p HoldCnt as an encoded Depctr immediate.
-unsigned encodeFieldHoldCnt(unsigned HoldCnt);
+unsigned encodeFieldHoldCnt(unsigned HoldCnt, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p HoldCnt.
-unsigned encodeFieldHoldCnt(unsigned HoldCnt, unsigned Encoded);
+unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt,
+ const IsaVersion &Version);
/// \returns \p VaSsrc as an encoded Depctr immediate.
-unsigned encodeFieldVaSsrc(unsigned VaSsrc);
+unsigned encodeFieldVaSsrc(unsigned VaSsrc, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p VaSsrc.
unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc);
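
A sketch of the new STI-aware overloads in use (illustrative only; assumes an
MCSubtargetInfo describing a target that has the field, plus the usual
includes for these declarations):

#include <cassert>

// Sketch: round-trip the VA_VDST depctr field through encode and decode.
unsigned roundTripVaVdst(const llvm::MCSubtargetInfo &STI) {
  unsigned Enc = llvm::AMDGPU::encodeFieldVaVdst(/*VaVdst=*/0, STI); // 0 waits on all VALU writes
  unsigned VaVdst = llvm::AMDGPU::decodeFieldVaVdst(Enc);
  assert(VaVdst <= llvm::AMDGPU::getVaVdstBitMask() && "must fit the field mask");
  return VaVdst; // expected: 0
}
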
@@ -1513,6 +1659,8 @@ constexpr inline bool isKernel(CallingConv::ID CC) {
}
}
+inline bool isKernel(const Function &F) { return isKernel(F.getCallingConv()); }
+
LLVM_READNONE
constexpr bool canGuaranteeTCO(CallingConv::ID CC) {
return CC == CallingConv::Fast;
@@ -1561,6 +1709,9 @@ bool isGFX11Plus(const MCSubtargetInfo &STI);
bool isGFX12(const MCSubtargetInfo &STI);
bool isGFX12Plus(const MCSubtargetInfo &STI);
bool isGFX1250(const MCSubtargetInfo &STI);
+bool isGFX1250Plus(const MCSubtargetInfo &STI);
+bool isGFX13(const MCSubtargetInfo &STI);
+bool isGFX13Plus(const MCSubtargetInfo &STI);
bool supportsWGP(const MCSubtargetInfo &STI);
bool isNotGFX12Plus(const MCSubtargetInfo &STI);
bool isNotGFX11Plus(const MCSubtargetInfo &STI);
@@ -1599,7 +1750,7 @@ LLVM_READNONE
MCRegister mc2PseudoReg(MCRegister Reg);
LLVM_READNONE
-bool isInlineValue(unsigned Reg);
+bool isInlineValue(MCRegister Reg);
/// Is this an AMDGPU specific source operand? These include registers,
/// inline constants, literals and mandatory literals (KImm).
@@ -1663,6 +1814,7 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
return 2;
@@ -1709,6 +1861,10 @@ LLVM_READNONE
std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal);
LLVM_READNONE
+std::optional<unsigned> getPKFMACF16InlineEncoding(uint32_t Literal,
+ bool IsGFX11Plus);
+
+LLVM_READNONE
bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType);
LLVM_READNONE
@@ -1721,6 +1877,9 @@ LLVM_READNONE
bool isInlinableLiteralV2F16(uint32_t Literal);
LLVM_READNONE
+bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus);
+
+LLVM_READNONE
bool isValid32BitLiteral(uint64_t Val, bool IsFP64);
LLVM_READNONE
@@ -1798,16 +1957,16 @@ bool isIntrinsicAlwaysUniform(unsigned IntrID);
/// \returns a register class for the physical register \p Reg if it is a VGPR
/// or nullptr otherwise.
-const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg,
+const MCRegisterClass *getVGPRPhysRegClass(MCRegister Reg,
const MCRegisterInfo &MRI);
/// \returns the MODE bits which have to be set by the S_SET_VGPR_MSB for the
/// physical register \p Reg.
-unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI);
+unsigned getVGPREncodingMSBs(MCRegister Reg, const MCRegisterInfo &MRI);
/// If \p Reg is a low VGPR return a corresponding high VGPR with \p MSBs set.
-MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs,
- const MCRegisterInfo &MRI);
+MCRegister getVGPRWithMSBs(MCRegister Reg, unsigned MSBs,
+ const MCRegisterInfo &MRI);
// Returns a table for the opcode with a given \p Desc to map the VGPR MSB
// set by the S_SET_VGPR_MSB to one of 4 sources. In case of VOPD returns 2
@@ -1867,7 +2026,7 @@ private:
Kind AttrKind = Kind::Unknown;
};
-} // end namespace AMDGPU
+} // namespace AMDGPU
raw_ostream &operator<<(raw_ostream &OS,
const AMDGPU::IsaInfo::TargetIDSetting S);
diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
index 5e89e34..75437cf 100644
--- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
@@ -10,7 +10,7 @@
// VINTERP encoding
//===----------------------------------------------------------------------===//
-class VINTERPe <VOPProfile P> : Enc64 {
+class VINTERPe : Enc64 {
bits<11> vdst;
bits<4> src0_modifiers;
bits<11> src0;
@@ -27,10 +27,10 @@ class VINTERPe <VOPProfile P> : Enc64 {
let Inst{7-0} = vdst{7-0};
let Inst{10-8} = waitexp;
// Fields for hi/lo 16-bits of register selection
- let Inst{11} = !if(P.HasSrc0, src0_modifiers{2}, 0);
- let Inst{12} = !if(P.HasSrc1, src1_modifiers{2}, 0);
- let Inst{13} = !if(P.HasSrc2, src2_modifiers{2}, 0);
- let Inst{14} = !if(P.HasDst, src0_modifiers{3}, 0);
+ let Inst{11} = src0_modifiers{2};
+ let Inst{12} = src1_modifiers{2};
+ let Inst{13} = src2_modifiers{2};
+ let Inst{14} = src0_modifiers{3};
let Inst{15} = clamp;
let Inst{40-32} = src0{8-0};
let Inst{49-41} = src1{8-0};
@@ -40,11 +40,11 @@ class VINTERPe <VOPProfile P> : Enc64 {
let Inst{63} = src2_modifiers{0}; // neg(2)
}
-class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : VINTERPe<P> {
+class VINTERPe_gfx11 <bits<7> op> : VINTERPe {
let Inst{22-16} = op;
}
-class VINTERPe_gfx12 <bits<7> op, VOPProfile P> : VINTERPe<P> {
+class VINTERPe_gfx12 <bits<7> op> : VINTERPe {
let Inst{20-16} = op{4-0};
}
@@ -243,7 +243,7 @@ multiclass VINTERP_Real_gfx11 <bits<7> op, string asmName> {
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in {
def _gfx11 :
VINTERP_Real<ps, SIEncodingFamily.GFX11, asmName>,
- VINTERPe_gfx11<op, ps.Pfl>;
+ VINTERPe_gfx11<op>;
}
}
@@ -253,7 +253,7 @@ multiclass VINTERP_Real_gfx12 <bits<7> op, string asmName> {
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in {
def _gfx12 :
VINTERP_Real<ps, SIEncodingFamily.GFX12, asmName>,
- VINTERPe_gfx12<op, ps.Pfl>;
+ VINTERPe_gfx12<op>;
}
}
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 54f57e0..56e7623 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -263,16 +263,19 @@ let HasOMod = 0, HasClamp = 0 in {
let isReMaterializable = 1 in {
let SchedRW = [WriteDoubleCvt] in {
// OMod clears exceptions when set in this instruction
+let IsDPMACCInstruction = 1 in
defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_sint>;
let mayRaiseFPException = 0 in {
defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>;
}
+let IsDPMACCInstruction = 1 in {
defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
-defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, any_fpextend>;
// OMod clears exceptions when set in this instruction
defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_uint>;
+} // IsDPMACCInstruction = 1
+defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, any_fpextend>;
let mayRaiseFPException = 0 in {
defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>;
@@ -349,11 +352,11 @@ defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, int_amdgcn_sqrt>;
} // End TRANS = 1, SchedRW = [WriteTrans32]
-let TRANS = 1, SchedRW = [WriteTrans64] in {
+let TRANS = 1, SchedRW = [WriteTrans64], IsDPMACCInstruction = 1 in {
defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64_NO_DPP, AMDGPUrcp>;
defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64_NO_DPP, AMDGPUrsq>;
defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64_NO_DPP, int_amdgcn_sqrt>;
-} // End TRANS = 1, SchedRW = [WriteTrans64]
+} // End TRANS = 1, SchedRW = [WriteTrans64], IsDPMACCInstruction = 1
let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
@@ -369,18 +372,45 @@ defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>;
defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32, AMDGPUffbl_b32>;
defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>;
-let SchedRW = [WriteDoubleAdd] in {
+let SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 in {
defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64_SPECIAL_OMOD, int_amdgcn_frexp_exp>;
defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
let FPDPRounding = 1 in {
defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
} // End FPDPRounding = 1
-} // End SchedRW = [WriteDoubleAdd]
+} // End SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1
defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>;
defm V_FREXP_MANT_F32 : VOP1Inst <"v_frexp_mant_f32", VOP_F32_F32, int_amdgcn_frexp_mant>;
} // End isReMaterializable = 1
+// These i32 conversions naturally saturate: out-of-range inputs clamp to the
+// destination range and NaN converts to zero, matching fp_to_{s,u}int_sat.
+def : GCNPat<(i32 (fp_to_uint_sat (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)), i32)),
+ (V_CVT_U32_F32_e64 $src0_modifiers, $src0)>;
+def : GCNPat<(i32 (fp_to_sint_sat (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)), i32)),
+ (V_CVT_I32_F32_e64 $src0_modifiers, $src0)>;
+def : GCNPat<(i32 (fp_to_uint_sat (f64 (VOP3Mods f64:$src0, i32:$src0_modifiers)), i32)),
+ (V_CVT_U32_F64_e64 $src0_modifiers, $src0)>;
+def : GCNPat<(i32 (fp_to_sint_sat (f64 (VOP3Mods f64:$src0, i32:$src0_modifiers)), i32)),
+ (V_CVT_I32_F64_e64 $src0_modifiers, $src0)>;
+def : GCNPat<(i32 (fp_to_uint_sat f32:$src0, i32)), (V_CVT_U32_F32_e32 (f32 $src0))>;
+def : GCNPat<(i32 (fp_to_sint_sat f32:$src0, i32)), (V_CVT_I32_F32_e32 (f32 $src0))>;
+def : GCNPat<(i32 (fp_to_uint_sat f64:$src0, i32)), (V_CVT_U32_F64_e32 (f64 $src0))>;
+def : GCNPat<(i32 (fp_to_sint_sat f64:$src0, i32)), (V_CVT_I32_F64_e32 (f64 $src0))>;
+
+def : GCNPat<(i32 (fp_to_uint_sat_gi (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (V_CVT_U32_F32_e64 $src0_modifiers, $src0)>;
+def : GCNPat<(i32 (fp_to_sint_sat_gi (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (V_CVT_I32_F32_e64 $src0_modifiers, $src0)>;
+def : GCNPat<(i32 (fp_to_uint_sat_gi (f64 (VOP3Mods f64:$src0, i32:$src0_modifiers)))),
+ (V_CVT_U32_F64_e64 $src0_modifiers, $src0)>;
+def : GCNPat<(i32 (fp_to_sint_sat_gi (f64 (VOP3Mods f64:$src0, i32:$src0_modifiers)))),
+ (V_CVT_I32_F64_e64 $src0_modifiers, $src0)>;
+def : GCNPat<(i32 (fp_to_uint_sat_gi f32:$src0)), (V_CVT_U32_F32_e32 (f32 $src0))>;
+def : GCNPat<(i32 (fp_to_sint_sat_gi f32:$src0)), (V_CVT_I32_F32_e32 (f32 $src0))>;
+def : GCNPat<(i32 (fp_to_uint_sat_gi f64:$src0)), (V_CVT_U32_F64_e32 (f64 $src0))>;
+def : GCNPat<(i32 (fp_to_sint_sat_gi f64:$src0)), (V_CVT_I32_F64_e32 (f64 $src0))>;
+
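
For reference (a plain-C++ sketch of the semantics, not part of the patch):
fp_to_uint_sat maps NaN to zero and clamps out-of-range inputs, which is the
behavior the hardware conversion already provides, so the patterns above need
no extra fixup code around V_CVT_U32_F32:

#include <cmath>
#include <cstdint>

// Sketch: saturating f32 -> u32 conversion, as the patterns above assume.
uint32_t fptoui32_sat(float F) {
  if (std::isnan(F)) return 0;               // NaN -> 0
  if (F <= 0.0f) return 0;                   // clamp below
  if (F >= 4294967296.0f) return UINT32_MAX; // clamp at 2^32
  return static_cast<uint32_t>(F);           // in-range: truncate toward zero
}
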
defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>;
// Restrict src0 to be VGPR
@@ -493,12 +523,12 @@ let SubtargetPredicate = isGFX7GFX8GFX9 in {
} // End SubtargetPredicate = isGFX7GFX8GFX9
let SubtargetPredicate = isGFX7Plus in {
- let SchedRW = [WriteDoubleAdd] in {
+ let SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 in {
defm V_TRUNC_F64 : VOP1Inst<"v_trunc_f64", VOP_F64_F64, ftrunc>;
defm V_CEIL_F64 : VOP1Inst<"v_ceil_f64", VOP_F64_F64, fceil>;
defm V_RNDNE_F64 : VOP1Inst<"v_rndne_f64", VOP_F64_F64, froundeven>;
defm V_FLOOR_F64 : VOP1Inst<"v_floor_f64", VOP_F64_F64, ffloor>;
- } // End SchedRW = [WriteDoubleAdd]
+ } // End SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1
} // End SubtargetPredicate = isGFX7Plus
} // End isReMaterializable = 1
@@ -513,6 +543,13 @@ defm V_CVT_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_u16_f16",
defm V_CVT_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_i16_f16",
VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16, fp_to_sint>;
+
+let HasClamp = 0, HasOMod = 0 in {
+def V_TRANS_BF16_Profile : VOPProfile <[bf16, bf16, untyped, untyped]>;
+def V_TRANS_BF16_t16_Profile : VOPProfile_True16 <VOP_BF16_BF16>;
+def V_TRANS_BF16_fake16_Profile : VOPProfile_Fake16 <VOP_BF16_BF16>;
+}
+
let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>;
@@ -527,14 +564,30 @@ defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>;
}
let SubtargetPredicate = HasBF16TransInsts in {
-defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>;
-defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>;
-defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>;
-defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>;
-defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>;
-defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>;
-defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>;
-defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>;
+defm V_TANH_BF16 : VOP1Inst_t16_with_profiles<"v_tanh_bf16", V_TRANS_BF16_Profile,
+ V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+ int_amdgcn_tanh>;
+defm V_RCP_BF16 : VOP1Inst_t16_with_profiles<"v_rcp_bf16", V_TRANS_BF16_Profile,
+ V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+ AMDGPUrcp>;
+defm V_SQRT_BF16 : VOP1Inst_t16_with_profiles<"v_sqrt_bf16", V_TRANS_BF16_Profile,
+ V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+ any_amdgcn_sqrt>;
+defm V_RSQ_BF16 : VOP1Inst_t16_with_profiles<"v_rsq_bf16", V_TRANS_BF16_Profile,
+ V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+ AMDGPUrsq>;
+defm V_LOG_BF16 : VOP1Inst_t16_with_profiles<"v_log_bf16", V_TRANS_BF16_Profile,
+ V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+ AMDGPUlogf16>;
+defm V_EXP_BF16 : VOP1Inst_t16_with_profiles<"v_exp_bf16", V_TRANS_BF16_Profile,
+ V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+ AMDGPUexpf16>;
+defm V_SIN_BF16 : VOP1Inst_t16_with_profiles<"v_sin_bf16", V_TRANS_BF16_Profile,
+ V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+ AMDGPUsin>;
+defm V_COS_BF16 : VOP1Inst_t16_with_profiles<"v_cos_bf16", V_TRANS_BF16_Profile,
+ V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+ AMDGPUcos>;
}
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
@@ -593,15 +646,15 @@ let SubtargetPredicate = isGFX9Plus in {
let isReMaterializable = 1 in
defm V_SAT_PK_U8_I16 : VOP1Inst_t16<"v_sat_pk_u8_i16", VOP_I16_I32>;
-
- let mayRaiseFPException = 0 in {
- defm V_CVT_NORM_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_i16_f16",
- VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>;
- defm V_CVT_NORM_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_u16_f16",
- VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>;
- } // End mayRaiseFPException = 0
} // End SubtargetPredicate = isGFX9Plus
+let mayRaiseFPException = 0, SubtargetPredicate = HasCvtNormInsts in {
+defm V_CVT_NORM_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_i16_f16",
+ VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>;
+defm V_CVT_NORM_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_u16_f16",
+ VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>;
+} // End mayRaiseFPException = 0, SubtargetPredicate = HasCvtNormInsts
+
let SubtargetPredicate = isGFX9Only in {
defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>;
} // End SubtargetPredicate = isGFX9Only
@@ -644,7 +697,7 @@ let OtherPredicates = [HasCvtFP8VOP1Bug] in {
(V_CVT_F32_BF8_sdwa 0, $src, 0, 0, 0)>;
}
-let OtherPredicates = [HasNoCvtFP8VOP1Bug, HasSDWA] in { // FIXME: HasSDWA is a substitute for !gfx12
+let OtherPredicates = [NotHasCvtFP8VOP1Bug, HasSDWA] in { // FIXME: HasSDWA is a substitute for !gfx12
def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)),
(V_CVT_F32_FP8_e32 $src)>;
def : GCNPat<(f32 (int_amdgcn_cvt_f32_bf8 i32:$src, 0)),
@@ -707,9 +760,9 @@ def V_CVT_F16_F8_True16_Profile : VOP3_Profile_True16<V_CVT_F16_F8_Profile>;
def V_CVT_F16_F8_Fake16_Profile : VOP3_Profile_Fake16<V_CVT_F16_F8_Profile>;
}
-let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts],
+let SubtargetPredicate = isGFX11Plus, OtherPredicates = [HasFP8ConversionInsts],
mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in {
- let SubtargetPredicate = isGFX12PlusNot12_50 in
+ let SubtargetPredicate = isGFX11PlusNot12_50 in
defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
let SubtargetPredicate = isGFX125xOnly in
defm V_CVT_F32_FP8_gfx1250 : VOP1Inst<"v_cvt_f32_fp8_gfx1250", VOPProfile_Base_CVT_F_F8_ByteSel<f32, 1>>;
@@ -733,7 +786,7 @@ class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst, bit HasOpSe
>;
let OtherPredicates = [HasFP8ConversionInsts] in {
- let SubtargetPredicate = isGFX12PlusNot12_50 in
+ let SubtargetPredicate = isGFX11PlusNot12_50 in
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_fp8, V_CVT_F32_FP8_OP_SEL_e64>;
let SubtargetPredicate = isGFX125xOnly in {
def : GCNPat<(int_amdgcn_cvt_f32_fp8 i32:$src0, timm:$byte_sel),
@@ -741,7 +794,7 @@ let OtherPredicates = [HasFP8ConversionInsts] in {
def : GCNPat<(int_amdgcn_cvt_f32_fp8_e5m3 i32:$src0, timm:$byte_sel),
(V_CVT_F32_FP8_gfx1250_e64 $src0, DSTCLAMP.ENABLE, (as_i32timm $byte_sel))>;
}
- let SubtargetPredicate = isGFX12Plus in
+ let SubtargetPredicate = isGFX11Plus in
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_bf8, V_CVT_F32_BF8_OP_SEL_e64>;
}
@@ -753,7 +806,7 @@ class Cvt_PK_F32_F8_Pat_OpSel<SDPatternOperator node, int index,
(inst_e32 $src))
>;
-let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in {
+let SubtargetPredicate = isGFX11Plus, OtherPredicates = [HasFP8ConversionInsts] in {
foreach Index = [0, -1] in {
def : Cvt_PK_F32_F8_Pat_OpSel<int_amdgcn_cvt_pk_f32_fp8, Index,
V_CVT_PK_F32_FP8_fake16_e32, V_CVT_PK_F32_FP8_fake16_e64>;
@@ -839,7 +892,7 @@ let SubtargetPredicate = isGFX11Plus in {
// Restrict src0 to be VGPR
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
[], /*VOP1Only=*/ 1>;
- let isAsCheapAsAMove = 1 in
+ let isAsCheapAsAMove = 1, isMoveImm = 1 in
defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>;
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
@@ -927,7 +980,7 @@ class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pf
}
//===----------------------------------------------------------------------===//
-// GFX11, GFX12
+// GFX11, GFX12, GFX13
//===----------------------------------------------------------------------===//
multiclass VOP1Only_Real<GFXGen Gen, bits<9> op> {
@@ -1001,10 +1054,19 @@ multiclass VOP1_Realtriple_e64_with_name<GFXGen Gen, bits<9> op, string opName,
asmName>;
}
+multiclass VOP1_Realtriple_e64_with_name_gfx12_gfx13<
+ bits<9> op, string opName, string asmName> :
+ VOP1_Realtriple_e64_with_name<GFX12Gen, op, opName, asmName>,
+ VOP1_Realtriple_e64_with_name<GFX13Gen, op, opName, asmName>;
+
multiclass VOP1_Real_FULL<GFXGen Gen, bits<9> op> :
VOP1_Real_e32<Gen, op>, VOP1_Realtriple_e64<Gen, op>,
VOP1_Real_dpp<Gen, op>, VOP1_Real_dpp8<Gen, op>;
+multiclass VOP1_Real_FULL_gfx1250_gfx13<bits<9> op> :
+ VOP1_Real_FULL<GFX1250Gen, op>,
+ VOP1_Real_FULL<GFX13Gen, op>;
+
multiclass VOP1_Real_NO_VOP3_with_name_gfx11<bits<9> op, string opName,
string asmName> {
defm NAME : VOP1_Real_e32_with_name<GFX11Gen, op, opName, asmName>,
@@ -1016,11 +1078,14 @@ multiclass VOP1_Real_NO_VOP3_with_name_gfx11<bits<9> op, string opName,
}
}
-multiclass VOP1_Real_NO_VOP3_with_name_gfx12<bits<9> op, string opName,
- string asmName> {
+multiclass VOP1_Real_NO_VOP3_with_name_gfx12_gfx13<
+ bits<9> op, string opName, string asmName> {
defm NAME : VOP1_Real_e32_with_name<GFX12Gen, op, opName, asmName>,
VOP1_Real_dpp_with_name<GFX12Gen, op, opName, asmName>,
VOP1_Real_dpp8_with_name<GFX12Gen, op, opName, asmName>;
+ defm NAME : VOP1_Real_e32_with_name<GFX13Gen, op, opName, asmName>,
+ VOP1_Real_dpp_with_name<GFX13Gen, op, opName, asmName>,
+ VOP1_Real_dpp8_with_name<GFX13Gen, op, opName, asmName>;
}
multiclass VOP1_Real_FULL_with_name<GFXGen Gen, bits<9> op, string opName,
@@ -1030,6 +1095,11 @@ multiclass VOP1_Real_FULL_with_name<GFXGen Gen, bits<9> op, string opName,
VOP1_Real_dpp8_with_name<Gen, op, opName, asmName>,
VOP1_Realtriple_e64_with_name<Gen, op, opName, asmName>;
+multiclass VOP1_Real_FULL_with_name_gfx1250_gfx13<
+ bits<9> op, string opName, string asmName> :
+ VOP1_Real_FULL_with_name<GFX1250Gen, op, opName, asmName>,
+ VOP1_Real_FULL_with_name<GFX13Gen, op, opName, asmName>;
+
multiclass VOP1_Real_NO_DPP<GFXGen Gen, bits<9> op> :
VOP1_Real_e32<Gen, op>, VOP1_Real_e64<Gen, op>;
@@ -1038,134 +1108,159 @@ multiclass VOP1_Real_with_DPP16<GFXGen Gen, bits<9> op> :
VOP1_Real_dpp<Gen, op>,
VOP3_Real_dpp_Base<Gen, {0, 1, 1, op{6-0}}>;
-multiclass VOP1_Real_FULL_t16_gfx11_gfx12<bits<9> op, string asmName,
- string opName = NAME> :
+multiclass VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<
+ bits<9> op, string asmName, string opName = NAME> :
VOP1_Real_FULL_with_name<GFX11Gen, op, opName, asmName>,
- VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;
+ VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>,
+ VOP1_Real_FULL_with_name<GFX13Gen, op, opName, asmName>;
+
+multiclass VOP1_Real_FULL_with_name_gfx12_gfx13<
+ bits<9> op, string opName, string asmName> :
+ VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>,
+ VOP1_Real_FULL_with_name<GFX13Gen, op, opName, asmName>;
-multiclass VOP1_Real_FULL_with_name_gfx11_gfx12<bits<9> op, string opName,
- string asmName> :
+multiclass VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<
+ bits<9> op, string opName, string asmName> :
VOP1_Real_FULL_with_name<GFX11Gen, op, opName, asmName>,
- VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;
+ VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>,
+ VOP1_Real_FULL_with_name<GFX13Gen, op, opName, asmName>;
-multiclass VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<
+multiclass VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<
bits<9> op, string asmName = !tolower(NAME), string opName = NAME> {
defm opName#"_t16" :
- VOP1_Real_FULL_with_name_gfx11_gfx12<op, opName#"_t16", asmName>;
+ VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<op, opName#"_t16", asmName>;
defm opName#"_fake16":
- VOP1_Real_FULL_with_name_gfx11_gfx12<op, opName#"_fake16", asmName>;
+ VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<op, opName#"_fake16", asmName>;
}
-multiclass VOP1Only_Real_gfx11_gfx12<bits<9> op> :
- VOP1Only_Real<GFX11Gen, op>, VOP1Only_Real<GFX12Gen, op>;
+multiclass VOP1Only_Real_gfx11_gfx12_gfx13<bits<9> op> :
+ VOP1Only_Real<GFX11Gen, op>, VOP1Only_Real<GFX12Gen, op>,
+ VOP1Only_Real<GFX13Gen, op>;
multiclass VOP1_Real_FULL_gfx11_gfx12<bits<9> op> :
VOP1_Real_FULL<GFX11Gen, op>, VOP1_Real_FULL<GFX12Gen, op>;
-multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250<
+multiclass VOP1_Real_e32_with_name_gfx11_gfx12_gfx13<bits<9> op, string opName,
+ string asmName> :
+ VOP1_Real_e32_with_name<GFX11Gen, op, opName, asmName>,
+ VOP1_Real_e32_with_name<GFX12Gen, op, opName, asmName>,
+ VOP1_Real_e32_with_name<GFX13Gen, op, opName, asmName>;
+
+multiclass VOP1_Real_FULL_t16<GFXGen Gen, bits<9> op> :
+ VOP1_Real_FULL_with_name<Gen, op, NAME,
+ !cast<VOP1_Pseudo>(!subst("_fake16", "", NAME)#"_e32").Mnemonic>;
+
+multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<
bits<9> op, string asmName = !tolower(NAME), string opName = NAME> {
defm opName#"_t16" :
- VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_t16", asmName>;
+ VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_t16", asmName>,
+ VOP1_Real_FULL_with_name<GFX13Gen, op, opName#"_t16", asmName>;
defm opName#"_fake16":
- VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>;
+ VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>,
+ VOP1_Real_FULL_with_name<GFX13Gen, op, opName#"_fake16", asmName>;
}
-multiclass VOP1_Real_FULL_with_name_gfx11_gfx12_not_gfx1250<bits<9> op, string opName,
- string asmName> :
+multiclass VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13_not_gfx1250<bits<9> op, string opName,
+ string asmName> :
VOP1_Real_FULL_with_name<GFX11Gen, op, opName, asmName>,
- VOP1_Real_FULL_with_name<GFX12Not12_50Gen, op, opName, asmName>;
+ VOP1_Real_FULL_with_name<GFX12Not12_50Gen, op, opName, asmName>,
+ VOP1_Real_FULL_with_name<GFX13Gen, op, opName, asmName>;
-multiclass VOP1_Real_OpSelIsDPP_gfx1250<bits<9> op> : VOP1_Real_e32<GFX1250Gen, op> {
+multiclass VOP1_Real_OpSelIsDPP<GFXGen Gen, bits<9> op> : VOP1_Real_e32<Gen, op> {
defvar ps = !cast<VOP_Pseudo>(NAME#"_e64");
- def _e64_gfx1250 :
- VOP3_Real_Gen<ps, GFX1250Gen>,
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
VOP3OpSelIsDPP_gfx12<{0, 1, 1, op{6-0}}, ps.Pfl>;
}
-defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name_gfx11_gfx12_not_gfx1250<0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
+multiclass VOP1_Real_OpSelIsDPP_gfx1250_gfx13<bits<9> op> :
+ VOP1_Real_OpSelIsDPP<GFX1250Gen, op>,
+ VOP1_Real_OpSelIsDPP<GFX13Gen, op>;
+
+defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13_not_gfx1250<0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX1250Gen, 0x06c, "V_CVT_F32_FP8_gfx1250", "v_cvt_f32_fp8">;
-defm V_CVT_F32_BF8 : VOP1_Real_FULL_with_name<GFX12Gen, 0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">;
+defm V_CVT_F32_BF8 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">;
-defm V_CVT_PK_F32_FP8_fake16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">;
-defm V_CVT_PK_F32_FP8_t16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8_t16", "v_cvt_pk_f32_fp8">;
-defm V_CVT_PK_F32_FP8_fake16 : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">;
-defm V_CVT_PK_F32_FP8_t16 : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_t16", "v_cvt_pk_f32_fp8">;
-defm V_CVT_PK_F32_BF8_fake16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06f, "V_CVT_PK_F32_BF8_fake16", "v_cvt_pk_f32_bf8">;
-defm V_CVT_PK_F32_BF8_t16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06f, "V_CVT_PK_F32_BF8_t16", "v_cvt_pk_f32_bf8">;
-defm V_CVT_PK_F32_BF8_fake16 : VOP3_Real_with_name<GFX12Gen, 0x1ef, "V_CVT_PK_F32_BF8_fake16", "v_cvt_pk_f32_bf8">;
-defm V_CVT_PK_F32_BF8_t16 : VOP3_Real_with_name<GFX12Gen, 0x1ef, "V_CVT_PK_F32_BF8_t16", "v_cvt_pk_f32_bf8">;
+defm V_CVT_PK_F32_FP8_fake16 : VOP1_Real_e32_with_name_gfx11_gfx12_gfx13<0x06e, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">;
+defm V_CVT_PK_F32_FP8_t16 : VOP1_Real_e32_with_name_gfx11_gfx12_gfx13<0x06e, "V_CVT_PK_F32_FP8_t16", "v_cvt_pk_f32_fp8">;
+defm V_CVT_PK_F32_FP8_fake16 : VOP3_Real_with_name_gfx11_gfx12_gfx13<0x1ee, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">;
+defm V_CVT_PK_F32_FP8_t16 : VOP3_Real_with_name_gfx11_gfx12_gfx13<0x1ee, "V_CVT_PK_F32_FP8_t16", "v_cvt_pk_f32_fp8">;
+defm V_CVT_PK_F32_BF8_fake16 : VOP1_Real_e32_with_name_gfx11_gfx12_gfx13<0x06f, "V_CVT_PK_F32_BF8_fake16", "v_cvt_pk_f32_bf8">;
+defm V_CVT_PK_F32_BF8_t16 : VOP1_Real_e32_with_name_gfx11_gfx12_gfx13<0x06f, "V_CVT_PK_F32_BF8_t16", "v_cvt_pk_f32_bf8">;
+defm V_CVT_PK_F32_BF8_fake16 : VOP3_Real_with_name_gfx11_gfx12_gfx13<0x1ef, "V_CVT_PK_F32_BF8_fake16", "v_cvt_pk_f32_bf8">;
+defm V_CVT_PK_F32_BF8_t16 : VOP3_Real_with_name_gfx11_gfx12_gfx13<0x1ef, "V_CVT_PK_F32_BF8_t16", "v_cvt_pk_f32_bf8">;
-defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00c,
+defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x00c,
"V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">;
-defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00d,
+defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x00d,
"V_CVT_FLR_I32_F32", "v_cvt_floor_i32_f32">;
-defm V_CLZ_I32_U32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x039,
+defm V_CLZ_I32_U32 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x039,
"V_FFBH_U32", "v_clz_i32_u32">;
-defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03a,
+defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x03a,
"V_FFBL_B32", "v_ctz_i32_b32">;
-defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03b,
+defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x03b,
"V_FFBH_I32", "v_cls_i32">;
-defm V_SWAP_B16 : VOP1Only_Real_gfx11_gfx12<0x066>;
-defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12<0x067>;
-defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x01c, "v_mov_b16">;
-defm V_NOT_B16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x069>;
-defm V_CVT_I32_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x06a>;
-defm V_CVT_U32_U16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x06b>;
-
-defm V_CVT_F16_U16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x050>;
-defm V_CVT_F16_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x051>;
-defm V_CVT_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x052>;
-defm V_CVT_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x053>;
-defm V_RCP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x054, "v_rcp_f16">;
-defm V_RCP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x054, "v_rcp_f16">;
-defm V_SQRT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x055, "v_sqrt_f16">;
-defm V_SQRT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x055, "v_sqrt_f16">;
-defm V_RSQ_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x056, "v_rsq_f16">;
-defm V_RSQ_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x056, "v_rsq_f16">;
-defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">;
-defm V_LOG_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">;
-defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">;
-defm V_EXP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">;
-defm V_FREXP_MANT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x059>;
-defm V_FREXP_EXP_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05a>;
-defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">;
-defm V_FLOOR_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">;
-defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
-defm V_CEIL_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
-defm V_TRUNC_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05d>;
-defm V_RNDNE_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05e>;
-defm V_FRACT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05f>;
-defm V_SIN_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x060>;
-defm V_COS_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x061>;
-defm V_SAT_PK_U8_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x062>;
-defm V_CVT_NORM_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x063>;
-defm V_CVT_NORM_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x064>;
-
-defm V_CVT_F16_F32 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00a>;
-defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>;
+defm V_SWAP_B16 : VOP1Only_Real_gfx11_gfx12_gfx13<0x066>;
+defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12_gfx13<0x067>;
+defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x01c, "v_mov_b16">;
+defm V_NOT_B16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x069>;
+defm V_CVT_I32_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x06a>;
+defm V_CVT_U32_U16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x06b>;
+
+defm V_CVT_F16_U16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x050>;
+defm V_CVT_F16_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x051>;
+defm V_CVT_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x052>;
+defm V_CVT_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x053>;
+defm V_RCP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x054, "v_rcp_f16">;
+defm V_RCP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x054, "v_rcp_f16">;
+defm V_SQRT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x055, "v_sqrt_f16">;
+defm V_SQRT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x055, "v_sqrt_f16">;
+defm V_RSQ_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x056, "v_rsq_f16">;
+defm V_RSQ_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x056, "v_rsq_f16">;
+defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x057, "v_log_f16">;
+defm V_LOG_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x057, "v_log_f16">;
+defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x058, "v_exp_f16">;
+defm V_EXP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x058, "v_exp_f16">;
+defm V_FREXP_MANT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x059>;
+defm V_FREXP_EXP_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x05a>;
+defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x05b, "v_floor_f16">;
+defm V_FLOOR_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x05b, "v_floor_f16">;
+defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x05c, "v_ceil_f16">;
+defm V_CEIL_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x05c, "v_ceil_f16">;
+defm V_TRUNC_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x05d>;
+defm V_RNDNE_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x05e>;
+defm V_FRACT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x05f>;
+defm V_SIN_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x060>;
+defm V_COS_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x061>;
+defm V_SAT_PK_U8_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x062>;
+defm V_CVT_NORM_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x063>;
+defm V_CVT_NORM_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x064>;
+
+defm V_CVT_F16_F32 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x00a>;
+defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x00b>;
defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>;
-defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>;
-defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>;
-defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>;
-defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
-defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>;
-defm V_CVT_F32_BF16_gfx1250 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16">;
-defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>;
-defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>;
-defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
-defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>;
-defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>;
-defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>;
-defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>;
-defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>;
-defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>;
-defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>;
-defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>;
-defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07e>;
-defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07f>;
+defm V_TANH_F32 : VOP1_Real_FULL_gfx1250_gfx13<0x01e>;
+defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x01f>;
+defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250_gfx13<0x049>;
+defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x04a>;
+defm V_PRNG_B32 : VOP1_Real_FULL_gfx1250_gfx13<0x04b>;
+defm V_CVT_F32_BF16_gfx1250 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x072, "v_cvt_f32_bf16">;
+defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x073>;
+defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x074>;
+defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x075>;
+defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x076>;
+defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x077>;
+defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x078>;
+defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x079>;
+defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07a>;
+defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07b>;
+defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07c>;
+defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07d>;
+defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07e>;
+defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07f>;
//===----------------------------------------------------------------------===//
// GFX10.
@@ -1213,17 +1308,22 @@ multiclass VOP1_Real_gfx10_FULL_gfx11_gfx12<bits<9> op> :
VOP1_Real_FULL<GFX11Gen, op>,
VOP1_Real_FULL<GFX12Gen, op>;
-multiclass VOP1_Real_gfx10_NO_DPP_gfx11_gfx12<bits<9> op> :
+multiclass VOP1_Real_gfx10_NO_DPP_gfx11_gfx12_gfx13<bits<9> op> :
VOP1_Real_gfx10<op>,
VOP1_Real_NO_DPP<GFX11Gen, op>,
- VOP1_Real_NO_DPP<GFX12Gen, op>;
+ VOP1_Real_NO_DPP<GFX12Gen, op>,
+ VOP1_Real_NO_DPP<GFX13Gen, op>;
multiclass VOP1Only_Real_gfx10_gfx11_gfx12<bits<9> op> :
VOP1Only_Real_gfx10<op>,
VOP1Only_Real<GFX11Gen, op>,
VOP1Only_Real<GFX12Gen, op>;
-defm V_PIPEFLUSH : VOP1_Real_gfx10_NO_DPP_gfx11_gfx12<0x01b>;
+multiclass VOP1Only_Real_gfx10_gfx11_gfx12_gfx13<bits<9> op> :
+ VOP1Only_Real_gfx10_gfx11_gfx12<op>,
+ VOP1Only_Real<GFX13Gen, op>;
+
+defm V_PIPEFLUSH : VOP1_Real_gfx10_NO_DPP_gfx11_gfx12_gfx13<0x01b>;
defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10_FULL_gfx11_gfx12<0x048>;
defm V_CVT_F16_U16 : VOP1_Real_gfx10<0x050>;
defm V_CVT_F16_I16 : VOP1_Real_gfx10<0x051>;
@@ -1247,7 +1347,7 @@ defm V_SAT_PK_U8_I16 : VOP1_Real_gfx10<0x062>;
defm V_CVT_NORM_I16_F16 : VOP1_Real_gfx10<0x063>;
defm V_CVT_NORM_U16_F16 : VOP1_Real_gfx10<0x064>;
-defm V_SWAP_B32 : VOP1Only_Real_gfx10_gfx11_gfx12<0x065>;
+defm V_SWAP_B32 : VOP1Only_Real_gfx10_gfx11_gfx12_gfx13<0x065>;
defm V_SWAPREL_B32 : VOP1Only_Real_gfx10_gfx11_gfx12<0x068>;
//===----------------------------------------------------------------------===//
@@ -1270,20 +1370,20 @@ let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
multiclass VOP1_Real_gfx7<bits<9> op> :
VOP1_Real_e32_gfx7<op>, VOP1_Real_e64_gfx7<op>;
-multiclass VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<bits<9> op> :
+multiclass VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<bits<9> op> :
VOP1_Real_gfx7<op>, VOP1_Real_gfx10<op>, VOP1_Real_NO_DPP<GFX11Gen, op>,
- VOP1_Real_with_DPP16<GFX12Gen, op>;
+ VOP1_Real_with_DPP16<GFX12Gen, op>, VOP1_Real_NO_DPP<GFX13Gen, op>;
defm V_LOG_LEGACY_F32 : VOP1_Real_gfx7<0x045>;
defm V_EXP_LEGACY_F32 : VOP1_Real_gfx7<0x046>;
-defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x017>;
-defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x018>;
-defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x019>;
-defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x01a>;
+defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x017>;
+defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x018>;
+defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x019>;
+defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x01a>;
//===----------------------------------------------------------------------===//
-// GFX6, GFX7, GFX10, GFX11, GFX12
+// GFX6, GFX7, GFX10, GFX11, GFX12, GFX13
//===----------------------------------------------------------------------===//
let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
@@ -1314,16 +1414,20 @@ multiclass VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<bits<9> op> :
VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_FULL<GFX11Gen, op>,
VOP1_Real_FULL<GFX12Gen, op>;
-multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<bits<9> op> :
+multiclass VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<bits<9> op> :
+ VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<op>,
+ VOP1_Real_FULL<GFX13Gen, op>;
+
+multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12_gfx13<bits<9> op> :
VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_NO_DPP<GFX11Gen, op>,
- VOP1_Real_NO_DPP<GFX12Gen, op>;
+ VOP1_Real_NO_DPP<GFX12Gen, op>, VOP1_Real_NO_DPP<GFX13Gen, op>;
-multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<bits<9> op> :
+multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<bits<9> op> :
VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_NO_DPP<GFX11Gen, op>,
- VOP1_Real_with_DPP16<GFX12Gen, op>;
+ VOP1_Real_with_DPP16<GFX12Gen, op>, VOP1_Real_NO_DPP<GFX13Gen, op>;
-multiclass VOP1Only_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<9> op> :
- VOP1Only_Real_gfx6_gfx7<op>, VOP1Only_Real_gfx10_gfx11_gfx12<op>;
+multiclass VOP1Only_Real_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<bits<9> op> :
+ VOP1Only_Real_gfx6_gfx7<op>, VOP1Only_Real_gfx10_gfx11_gfx12_gfx13<op>;
defm V_LOG_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x026>;
defm V_RCP_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x028>;
@@ -1333,59 +1437,63 @@ defm V_RSQ_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x02d>;
defm V_RCP_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x030>;
defm V_RSQ_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x032>;
-defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x000>;
-defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x001>;
-defm V_READFIRSTLANE_B32 : VOP1Only_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x002>;
-defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x003>;
-defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x004>;
-defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x005>;
-defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x006>;
-defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x007>;
-defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x008>;
+defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12_gfx13<0x000>;
+defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x001>;
+defm V_READFIRSTLANE_B32 : VOP1Only_Real_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<0x002>;
+defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x003>;
+defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x004>;
+defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x005>;
+defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x006>;
+defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x007>;
+defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x008>;
defm V_CVT_F16_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00a>;
defm V_CVT_F32_F16 : VOP1_Real_gfx6_gfx7_gfx10<0x00b>;
defm V_CVT_RPI_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00c>;
defm V_CVT_FLR_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00d>;
-defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x00e>;
-defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x00f>;
-defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x010>;
-defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x011>;
-defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x012>;
-defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x013>;
-defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x014>;
-defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x015>;
-defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x016>;
-defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x020>;
-defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x021>;
-defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x022>;
-defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x023>;
-defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x024>;
-defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x025>;
-defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x027>;
-defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x02a>;
-defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x02b>;
-defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x02e>;
-defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x02f>;
-defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x031>;
-defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x033>;
-defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x034>;
-defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x035>;
-defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x036>;
-defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x037>;
-defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x038>;
+defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x00e>;
+defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x00f>;
+defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x010>;
+defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x011>;
+defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x012>;
+defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x013>;
+defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x014>;
+defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x015>;
+defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x016>;
+defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x020>;
+defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x021>;
+defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x022>;
+defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x023>;
+defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x024>;
+defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x025>;
+defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x027>;
+defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x02a>;
+defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x02b>;
+defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x02e>;
+defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12_gfx13<0x02f>;
+defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12_gfx13<0x031>;
+defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x033>;
+defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12_gfx13<0x034>;
+defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x035>;
+defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x036>;
+defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x037>;
+defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x038>;
defm V_FFBH_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x039>;
defm V_FFBL_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x03a>;
defm V_FFBH_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x03b>;
-defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x03c>;
-defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x03d>;
-defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x03e>;
-defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x03f>;
-defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x040>;
+defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x03c>;
+defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x03d>;
+defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x03e>;
+defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x03f>;
+defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x040>;
defm V_CLREXCP : VOP1_Real_gfx6_gfx7_gfx10<0x041>;
defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x042>;
defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x043>;
defm V_MOVRELSD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x044>;
+def : AMDGPUMnemonicAlias<"v_brev_b32", "v_bfrev_b32"> {
+ let AssemblerPredicate = isGFX13Plus;
+}
+
//===----------------------------------------------------------------------===//
// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index d87d250..2ccf392 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -15,8 +15,8 @@ class VOP2e <bits<6> op, VOPProfile P> : Enc32 {
bits<9> src0;
bits<8> src1;
- let Inst{8-0} = !if(P.HasSrc0, src0, 0);
- let Inst{16-9} = !if(P.HasSrc1, src1, 0);
+ let Inst{8-0} = !if(P.HasSrc0, src0, ?);
+ let Inst{16-9} = !if(P.HasSrc1, src1, ?);
let Inst{24-17} = !if(P.EmitDst, vdst, 0);
let Inst{30-25} = op;
let Inst{31} = 0x0; //encoding
@@ -28,8 +28,8 @@ class VOP2_MADKe <bits<6> op, VOPProfile P> : Enc64 {
bits<8> src1;
bits<32> imm;
- let Inst{8-0} = !if(P.HasSrc0, src0, 0);
- let Inst{16-9} = !if(P.HasSrc1, src1, 0);
+ let Inst{8-0} = !if(P.HasSrc0, src0, ?);
+ let Inst{16-9} = !if(P.HasSrc1, src1, ?);
let Inst{24-17} = !if(P.EmitDst, vdst, 0);
let Inst{30-25} = op;
let Inst{31} = 0x0; // encoding
@@ -42,8 +42,8 @@ class VOP2_MADK64e <bits<6> op, VOPProfile P> : Enc96 {
bits<8> src1;
bits<64> imm;
- let Inst{8-0} = !if(P.HasSrc0, src0, 0);
- let Inst{16-9} = !if(P.HasSrc1, src1, 0);
+ let Inst{8-0} = !if(P.HasSrc0, src0, ?);
+ let Inst{16-9} = !if(P.HasSrc1, src1, ?);
let Inst{24-17} = !if(P.EmitDst, vdst, 0);
let Inst{30-25} = op;
let Inst{31} = 0x0; // encoding
@@ -55,7 +55,7 @@ class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> {
bits<8> src1;
let Inst{8-0} = 0xf9; // sdwa
- let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
+ let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, ?);
let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
let Inst{30-25} = op;
let Inst{31} = 0x0; // encoding
@@ -66,11 +66,11 @@ class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P> {
bits<9> src1;
let Inst{8-0} = 0xf9; // sdwa
- let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
+ let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, ?);
let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
let Inst{30-25} = op;
let Inst{31} = 0x0; // encoding
- let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr
+ let Inst{63} = !if(P.HasSrc1, src1{8}, ?); // src1_sgpr
}
class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> :
@@ -971,7 +971,7 @@ defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_a
} // End IsNeverUniform = 1
defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, any_fldexp>;
-let ReadsModeReg = 0, mayRaiseFPException = 0 in {
+let ReadsModeReg = 0, mayRaiseFPException = 0, SubtargetPredicate = HasCvtPkNormVOP2Insts in {
defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_i16_f32>;
defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_u16_f32>;
}
@@ -1266,14 +1266,14 @@ let Constraints = "$vdst = $src2",
defm V_FMAC_F32 : VOP2Inst_VOPD <"v_fmac_f32", VOP_MAC_F32, 0x0, "v_fmac_f32">;
} // End SubtargetPredicate = HasDLInsts
-let SubtargetPredicate = HasFmaLegacy32 in {
+let SubtargetPredicate = HasFmacLegacy32 in {
let Constraints = "$vdst = $src2",
isConvertibleToThreeAddress = 1,
isCommutable = 1 in
defm V_FMAC_LEGACY_F32 : VOP2Inst <"v_fmac_legacy_f32", VOP_MAC_LEGACY_F32>;
-} // End SubtargetPredicate = HasFmaLegacy32
+} // End SubtargetPredicate = HasFmacLegacy32
let SubtargetPredicate = HasFmacF64Inst,
Constraints = "$vdst = $src2",
@@ -1348,10 +1348,15 @@ let isCommutable = 1 in
def V_FMAAK_F64 : VOP2_Pseudo<"v_fmaak_f64", VOP_MADAK_F64, [], "">;
} // End SubtargetPredicate = HasFmaakFmamkF64Insts, isReMaterializable = 1, FixedSize = 1, Size = 12, SchedRW = [Write64Bit]
+// A dedicated profile for V_PK_FMAC_F16: identical to VOP_V2F16_V2F16_V2F16
+// except that src0 uses the splat-constrained VSrc_v2f16_splat operand class.
+def VOP_V2F16_V2F16_V2F16_SPLAT : VOPProfile <[v2f16, v2f16, v2f16, untyped]> {
+ let Src0RC32 = VSrc_v2f16_splat;
+}
+
let SubtargetPredicate = HasPkFmacF16Inst in {
// FIXME: V_PK_FMAC_F16 is currently not used in instruction selection.
// If this changes, ensure the DPP variant is not used for GFX11+.
-defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>;
+defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16_SPLAT>;
} // End SubtargetPredicate = HasPkFmacF16Inst
// Note: 16-bit instructions produce a 0 result in the high 16-bits
@@ -1481,7 +1486,7 @@ let SubtargetPredicate = isGFX12Plus, isReMaterializable = 1 in {
} // End SubtargetPredicate = isGFX12Plus, isReMaterializable = 1
let SubtargetPredicate = HasIEEEMinimumMaximumInsts, isReMaterializable = 1,
- SchedRW = [WriteDoubleAdd], isCommutable = 1 in {
+ SchedRW = [WriteDoubleAdd], isCommutable = 1, IsDPMACCInstruction = 1 in {
defm V_MIN_NUM_F64 : VOP2Inst_VOPD <"v_min_num_f64", VOP_F64_F64_F64, 0x24, "v_min_num_f64", fminnum_like>;
defm V_MAX_NUM_F64 : VOP2Inst_VOPD <"v_max_num_f64", VOP_F64_F64_F64, 0x23, "v_max_num_f64", fmaxnum_like>;
}
@@ -1502,7 +1507,7 @@ class VOP2_DPP<bits<6> op, VOP2_DPP_Pseudo ps,
bits<8> vdst;
bits<8> src1;
let Inst{8-0} = 0xfa;
- let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, 0);
+ let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, ?);
let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0);
let Inst{30-25} = op;
let Inst{31} = 0x0;
@@ -1544,7 +1549,7 @@ class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
bits<8> src1;
let Inst{8-0} = fi;
- let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, 0);
+ let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, ?);
let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0);
let Inst{30-25} = op;
let Inst{31} = 0x0;
@@ -2346,7 +2351,7 @@ class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
bits<8> vdst;
bits<8> src1;
let Inst{8-0} = 0xfa; //dpp
- let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
+ let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, ?);
let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
let Inst{30-25} = op;
let Inst{31} = 0x0; //encoding
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 42ec8ba..bdcf04f 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -151,7 +151,7 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod,
class VOP3_INTERP16 <list<ValueType> ArgVT, bit OpSel = 0> : VOPProfile<ArgVT> {
let IsSingle = 1;
- let HasOMod = !ne(DstVT.Value, f16.Value);
+ let HasOMod = !ne(DstVT, f16);
let HasHigh = 1;
let HasOpSel = OpSel;
@@ -185,7 +185,8 @@ defm V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32",
defm V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>, VOPD_Component<0x13, "v_fma_f32">;
-defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
+let SubtargetPredicate = HasLerpInst in
+ defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
let SchedRW = [WriteIntMul] in {
let SubtargetPredicate = HasMadU32Inst in
@@ -198,9 +199,11 @@ let SchedRW = [WriteIntMul] in {
let SchedRW = [WriteDoubleAdd] in {
let FPDPRounding = 1 in {
+let IsDPMACCInstruction = 1 in
defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP_F64_F64_F64_F64_DPP_PROF, any_fma>, VOPD_Component<0x20, "v_fma_f64">;
let SubtargetPredicate = isNotGFX12Plus in {
defm V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fadd>;
+let IsDPMACCInstruction = 1 in
defm V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fmul>;
} // End SubtargetPredicate = isNotGFX12Plus
} // End FPDPRounding = 1
@@ -223,10 +226,10 @@ defm V_MAXIMUM_F32 : VOP3Inst <"v_maximum_f32", VOP3_Profile<VOP_F32_F32_F32>, f
defm V_MINIMUM_F16 : VOP3Inst_t16 <"v_minimum_f16", VOP_F16_F16_F16, fminimum>;
defm V_MAXIMUM_F16 : VOP3Inst_t16 <"v_maximum_f16", VOP_F16_F16_F16, fmaximum>;
-let SchedRW = [WriteDoubleAdd] in {
+let SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 in {
defm V_MINIMUM_F64 : VOP3Inst <"v_minimum_f64", VOP3_Profile<VOP_F64_F64_F64>, fminimum>;
defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaximum>;
-} // End SchedRW = [WriteDoubleAdd]
+} // End SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1
} // End SubtargetPredicate = HasIEEEMinimumMaximumInsts, ReadsModeReg = 0, AddedComplexity = 1
let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] in {
@@ -251,19 +254,19 @@ defm V_DIV_FMAS_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_fmas_f32", VOP_F32_F32_F32
// if (vcc)
// result *= 2^64
//
-let SchedRW = [WriteDouble], FPDPRounding = 1 in
+let SchedRW = [WriteDouble], FPDPRounding = 1, IsDPMACCInstruction = 1 in
defm V_DIV_FMAS_F64 : VOP3Inst <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC>;
} // End Uses = [MODE, VCC, EXEC]
} // End isCommutable = 1
let isReMaterializable = 1 in {
-let mayRaiseFPException = 0 in {
+let mayRaiseFPException = 0, SubtargetPredicate = HasCubeInsts in {
defm V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>;
defm V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>;
defm V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubetc>;
defm V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubema>;
-} // End mayRaiseFPException
+} // End mayRaiseFPException = 0, SubtargetPredicate = HasCubeInsts
defm V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>;
defm V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>;
@@ -306,20 +309,20 @@ let SubtargetPredicate = HasMinimum3Maximum3F32, ReadsModeReg = 0 in {
defm V_MAXIMUM3_F32 : VOP3Inst <"v_maximum3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmaximum3>;
} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
-let isCommutable = 1 in {
+let isCommutable = 1, SubtargetPredicate = HasSadInsts in {
defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-} // End isCommutable = 1
+} // End isCommutable = 1, SubtargetPredicate = HasSadInsts
defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", DIV_FIXUP_F32_PROF, AMDGPUdiv_fixup>;
-let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
+let SchedRW = [WriteDoubleAdd], FPDPRounding = 1, IsDPMACCInstruction = 1 in {
defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP_F64_F64_F64_F64_DPP_PROF, AMDGPUdiv_fixup>;
defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, any_fldexp>;
-} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
+} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1, IsDPMACCInstruction = 1
} // End isReMaterializable = 1
let SubtargetPredicate = isGFX9GFX10 in
@@ -357,7 +360,7 @@ let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it d
defm V_DIV_SCALE_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32> ;
// Double precision division pre-scale.
- let SchedRW = [WriteDouble, WriteSALU], FPDPRounding = 1 in
+ let SchedRW = [WriteDouble, WriteSALU], FPDPRounding = 1, IsDPMACCInstruction = 1 in
defm V_DIV_SCALE_F64 : VOP3Inst <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64>;
} // End mayRaiseFPException = 0
@@ -370,12 +373,12 @@ defm V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64
let isReMaterializable = 1 in {
-let SchedRW = [WriteDouble] in {
+let SchedRW = [WriteDouble], IsDPMACCInstruction = 1 in {
defm V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, int_amdgcn_trig_preop>;
-} // End SchedRW = [WriteDouble]
+} // End SchedRW = [WriteDouble], IsDPMACCInstruction = 1
let SchedRW = [Write64Bit] in {
- let SubtargetPredicate = isGFX6GFX7 in {
+ let SubtargetPredicate = isGFX6GFX7, IsDPMACCInstruction = 1 in {
defm V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, cshl_64>;
defm V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, csrl_64>;
defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, csra_64>;
@@ -424,15 +427,16 @@ def VOPProfileMQSAD : VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP> {
let SubtargetPredicate = isGFX7Plus in {
let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
-defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
+let SubtargetPredicate = HasQsadInsts in
+ defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>;
} // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32]
} // End SubtargetPredicate = isGFX7Plus
let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in {
let SubtargetPredicate = isGFX7Plus in {
- defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64_DPP, null_frag, [HasNotMADIntraFwdBug]>;
- defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64_DPP, null_frag, [HasNotMADIntraFwdBug]>;
+ defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64_DPP, null_frag, [NotHasMADIntraFwdBug]>;
+ defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64_DPP, null_frag, [NotHasMADIntraFwdBug]>;
}
let SubtargetPredicate = isGFX11Only, OtherPredicates = [HasMADIntraFwdBug],
Constraints = "@earlyclobber $vdst" in {
@@ -634,19 +638,13 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : ThreeOpFragSDA
}
def shl_0_to_4 : PatFrag<
- (ops node:$src0, node:$src1), (shl node:$src0, node:$src1),
- [{
- if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
- return C->getZExtValue() <= 4;
- }
- return false;
- }]> {
+ (ops node:$src0, node:$src1), (shl node:$src0, node:$src1), [{
+ KnownBits KB = CurDAG->computeKnownBits(N->getOperand(1));
+ return KB.getMaxValue().getZExtValue() <= 4;
+ }]> {
let GISelPredicateCode = [{
- int64_t Imm = 0;
- if (!mi_match(MI.getOperand(2).getReg(), MRI, m_ICst(Imm)) &&
- !mi_match(MI.getOperand(2).getReg(), MRI, m_Copy(m_ICst(Imm))))
- return false;
- return (uint64_t)Imm <= 4;
+ KnownBits KB = VT->getKnownBits(MI.getOperand(2).getReg());
+ return KB.getMaxValue().getZExtValue() <= 4;
}];
}
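
The rewritten predicate generalizes the old constant-only check: a shift
amount such as (and x, 3) is not a ConstantSDNode, but computeKnownBits (or
the GlobalISel value tracking used in GISelPredicateCode) proves its maximum
is 3, so it now matches. A hypothetical variant with a different bound, just
to show the reusable shape:

def shl_0_to_7_sketch : PatFrag<
  (ops node:$src0, node:$src1), (shl node:$src0, node:$src1), [{
    // Accept any shift amount provably at most 7, constant or not.
    KnownBits KB = CurDAG->computeKnownBits(N->getOperand(1));
    return KB.getMaxValue().getZExtValue() <= 7;
  }]>;
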
@@ -775,10 +773,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in {
- defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP_I32_I32_I32_I32>;
- defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP_I32_I32_I32_I32>;
- defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP_I32_I32_I32_I32>;
- defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP_I32_I32_I32_I32>;
+ defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_i32>;
+ defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_u32>;
+ defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_i32>;
+ defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_u32>;
}
defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
@@ -789,9 +787,6 @@ let isCommutable = 1 in {
defm V_MAD_I32_I16 : VOP3Inst_t16 <"v_mad_i32_i16", VOP_I32_I16_I16_I32>;
} // End isCommutable = 1
-defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>;
-defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>;
-
defm V_PACK_B32_F16 : VOP3Inst_t16 <"v_pack_b32_f16", VOP_B32_F16_F16>;
let isReMaterializable = 1 in {
@@ -820,13 +815,13 @@ let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
VOP3_CVT_PK_F8_F32_Profile_t16<>,
VOP3_CVT_PK_F8_F32_Profile_fake16<>>;
- let SubtargetPredicate = isGFX12Plus in {
+ let SubtargetPredicate = isGFX11Plus in {
let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in
defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in
defm V_CVT_SR_FP8_F32_gfx1250 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx1250", VOP3_CVT_SR_F8_ByteSel_Profile<f32, true>>;
defm V_CVT_SR_BF8_F32_gfx12 : VOP3Inst<"v_cvt_sr_bf8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
- }
+ } // End SubtargetPredicate = isGFX11Plus
}
// These instructions have non-standard use of op_sel. In particular they are
@@ -930,7 +925,7 @@ let SubtargetPredicate = isGFX940Plus in {
}
}
-let SubtargetPredicate = isGFX12Plus in {
+let SubtargetPredicate = isGFX11Plus in {
let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx12_e64, f32>;
let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in {
@@ -938,7 +933,7 @@ let SubtargetPredicate = isGFX12Plus in {
def : Cvt_SR_F8_ByteSel_E5M3_Pat<int_amdgcn_cvt_sr_fp8_f32_e5m3, V_CVT_SR_FP8_F32_gfx1250_e64, f32, DSTCLAMP.ENABLE>;
}
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f32, V_CVT_SR_BF8_F32_gfx12_e64, f32>;
-}
+} // End SubtargetPredicate = isGFX11Plus
}
class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
@@ -976,10 +971,10 @@ def : GCNPat <
} // End SubtargetPredicate = HasLshlAddU64Inst
let SubtargetPredicate = HasAddMinMaxInsts in {
-def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>;
-def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>;
-def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>;
-def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>;
+def : ThreeOp_i32_Pats<saddsat, smax, V_ADD_MAX_I32_e64>;
+def : ThreeOp_i32_Pats<uaddsat, umax, V_ADD_MAX_U32_e64>;
+def : ThreeOp_i32_Pats<saddsat, smin, V_ADD_MIN_I32_e64>;
+def : ThreeOp_i32_Pats<uaddsat, umin, V_ADD_MIN_U32_e64>;
}
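
The switch from add to saddsat/uaddsat here lines up with the VOP3_CLAMP
profiles these instructions now carry: with the clamp bit set the
intermediate add saturates, so per 32-bit lane the fold is

  (smax (saddsat a, b), c) -> v_add_max_i32 a, b, c clamp

and likewise for the unsigned and min variants (assuming ThreeOp_i32_Pats
emits the instruction with clamp enabled, which the saturating source
operators require). The packed v_pk_add_* patterns later in this patch get
the same treatment.
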
def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
@@ -996,6 +991,11 @@ def : GCNPat<(DivergentBinFrag<or> (or_oneuse i64:$src0, i64:$src1), i64:$src2),
} // End SubtargetPredicate = isGFX9Plus
+let SubtargetPredicate = HasCvtPkNormVOP3Insts in {
+ defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>;
+ defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>;
+} // End SubtargetPredicate = HasCvtPkNormVOP3Insts
+
// FIXME: Probably should hardcode clamp bit in pseudo and avoid this.
class OpSelBinOpClampPat<SDPatternOperator node,
Instruction inst> : GCNPat<
@@ -1061,7 +1061,7 @@ multiclass IMAD32_Mul24_Pats<VOP3_Pseudo inst> {
}
// exclude pre-GFX9 where it was slow
-let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in {
+let OtherPredicates = [NotHasMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in {
defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
defm : IMAD32_Mul24_Pats<V_MAD_U64_U32_e64>;
}
@@ -1717,6 +1717,28 @@ let SubtargetPredicate = isGFX11Plus in {
defm V_MINMAX_I32 : VOP3Inst<"v_minmax_i32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
defm V_CVT_PK_I16_F32 : VOP3Inst<"v_cvt_pk_i16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
+
+ def : GCNPat<(v2i16 (build_vector (i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (i16 (fp_to_sint (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))),
+ (V_CVT_PK_I16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>;
+ def : GCNPat<(v2i16 (build_vector (i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (i16 (fp_to_uint (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))),
+ (V_CVT_PK_U16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>;
+
+ // Fallback patterns for f32->i16 conversion. These are only required because
+ // f32->i16 has to be legal so that we can select V_CVT_PK_[IU]16_F32 above.
+ let True16Predicate = UseRealTrue16Insts in {
+ def : GCNPat<(i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (EXTRACT_SUBREG (V_CVT_I32_F32_e64 $src0_modifiers, $src0), lo16)>;
+ def : GCNPat<(i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (EXTRACT_SUBREG (V_CVT_U32_F32_e64 $src0_modifiers, $src0), lo16)>;
+ }
+ let True16Predicate = NotUseRealTrue16Insts in {
+ def : GCNPat<(i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (V_CVT_I32_F32_e64 $src0_modifiers, $src0)>;
+ def : GCNPat<(i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (V_CVT_U32_F32_e64 $src0_modifiers, $src0)>;
+ }
} // End SubtargetPredicate = isGFX11Plus
class VOP3_CVT_SR_FP16_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_F1632_FP8BF8_TiedInput_Profile<P> {
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 6500fce..9a4054b 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -75,7 +75,7 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
SDPatternOperator node = null_frag, bit IsDOT = 0> {
def NAME : VOP3P_Pseudo<OpName, P,
!if (P.HasModifiers,
- getVOP3PModPat<P, node, IsDOT, IsDOT>.ret,
+ getVOP3PModPat<P, node, !or(P.EnableClamp, IsDOT), IsDOT>.ret,
getVOP3Pat<P, node>.ret)>;
let SubtargetPredicate = isGFX11Plus in {
if P.HasExtVOP3DPP then
@@ -182,6 +182,8 @@ multiclass MadFmaMixFP32Pats<SDPatternOperator fma_like,
ValueType VT = f16> {
defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
+ defvar OneImm = !if (!eq(VT, bf16), CONST.BF16_ONE, CONST.FP16_ONE);
+ defvar NegOneImm = !if (!eq(VT, bf16), CONST.BF16_NEG_ONE, CONST.FP16_NEG_ONE);
// At least one of the operands needs to be an fpextend of an f16
// for this to be worthwhile, so we need three patterns here.
// TODO: Could we use a predicate to inspect src1/2/3 instead?
@@ -203,6 +205,34 @@ multiclass MadFmaMixFP32Pats<SDPatternOperator fma_like,
(f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
+
+ // (fadd x, y) -> (fma x, 1.0, y)
+ def : GCNPat <
+ (f32 (fadd (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))),
+ (mix_inst $src0_mods, $src0, (i32 8), (i32 OneImm), $src1_mods, $src1,
+ DSTCLAMP.NONE)>;
+
+ // (fmul x, y) -> (fma x, y, -0.0)
+ def : GCNPat <
+ (f32 (fmul (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))),
+ (mix_inst $src0_mods, $src0, $src1_mods, $src1, (i32 SRCMODS.NEG), (i32 0),
+ DSTCLAMP.NONE)>;
+
+ // (fsub x, y) -> (fma y, -1.0, x)
+ def : GCNPat <
+ (f32 (fsub (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))),
+ (mix_inst $src1_mods, $src1, (i32 8), (i32 NegOneImm), $src0_mods, $src0,
+ DSTCLAMP.NONE)>;
+
+  // (fsub x, y) -> (fma y, -1.0, x), with the fpextend on the other operand
+ def : GCNPat <
+ (f32 (fsub (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)))),
+ (mix_inst $src1_mods, $src1, (i32 8), (i32 NegOneImm), $src0_mods, $src0,
+ DSTCLAMP.NONE)>;
}
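
The three added patterns rely on the standard fma rewrites, in the same
notation as the comments above:

  (fadd x, y) -> (fma x, 1.0, y)
  (fmul x, y) -> (fma x, y, -0.0)
  (fsub x, y) -> (fma y, -1.0, x)

The -0.0 addend for fmul (an inline 0 with SRCMODS.NEG) keeps a
negative-zero product intact: fma(x, y, +0.0) would evaluate -0.0 + +0.0 to
+0.0. The (i32 8) source modifier is the op_sel_hi bit, which marks the
inline OneImm/NegOneImm constant as a 16-bit value so the mix instruction
reads it as f16/bf16 1.0 or -1.0.
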
multiclass MadFmaMixFP16Pats<SDPatternOperator fma_like,
@@ -235,7 +265,7 @@ multiclass MadFmaMixFP16Pats<SDPatternOperator fma_like,
(f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))),
(mixlo_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
- (i32 0), (i32 0),
+ (i32 SRCMODS.NONE), (i32 0),
DSTCLAMP.NONE,
(i32 (IMPLICIT_DEF)))
>;
@@ -245,7 +275,7 @@ multiclass MadFmaMixFP16Pats<SDPatternOperator fma_like,
(f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers)))))),
(vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
- (i32 0), (i32 0),
+ (i32 SRCMODS.NONE), (i32 0),
DSTCLAMP.NONE,
VGPR_32:$elt0))
>;
@@ -299,7 +329,7 @@ multiclass MadFmaMixFP16Pats_t16<SDPatternOperator fma_like,
(f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))),
(mix_inst_16 $src0_modifiers, $src0,
$src1_modifiers, $src1,
- (i32 0), (i32 0),
+ (i32 SRCMODS.NONE), (i32 0),
DSTCLAMP.NONE)
>;
@@ -434,15 +464,16 @@ defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_BF16_t16>;
} // End SubtargetPredicate = HasFmaMixBF16Insts
def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
- let HasModifiers = 0;
+ let HasNeg = 0;
+ let EnableClamp = 1;
}
let isCommutable = 1, isReMaterializable = 1 in {
let SubtargetPredicate = HasPkAddMinMaxInsts in {
-defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile>;
-defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile>;
-defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile>;
-defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile>;
+defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_i16>;
+defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_u16>;
+defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_i16>;
+defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_u16>;
}
let SubtargetPredicate = HasPkMinMax3Insts in {
defm V_PK_MAX3_I16 : VOP3PInst<"v_pk_max3_i16", PK_ADD_MINMAX_Profile>;
@@ -463,10 +494,10 @@ class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2,
>;
let SubtargetPredicate = HasPkAddMinMaxInsts in {
-def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>;
-def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>;
-def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>;
-def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>;
+def : ThreeOp_OpSelClampPats<saddsat, smax, V_PK_ADD_MAX_I16>;
+def : ThreeOp_OpSelClampPats<uaddsat, umax, V_PK_ADD_MAX_U16>;
+def : ThreeOp_OpSelClampPats<saddsat, smin, V_PK_ADD_MIN_I16>;
+def : ThreeOp_OpSelClampPats<uaddsat, umin, V_PK_ADD_MIN_U16>;
}
let SubtargetPredicate = HasPkMinMax3Insts in {
@@ -662,7 +693,6 @@ def VOP3P_DOTF8_Profile : VOP3P_Profile<VOPProfile <[f32, i32, i32, f32]>,
multiclass VOP3PDOTF8Inst <string OpName, SDPatternOperator intrinsic_node> {
defm NAME : VOP3PInst<OpName, VOP3P_DOTF8_Profile, null_frag, 1>;
- let SubtargetPredicate = isGFX12Plus in
def : GCNPat <(intrinsic_node i32:$src0, i32:$src1,
(VOP3Mods f32:$src2, i32:$src2_modifiers)),
(!cast<Instruction>(NAME) i32:$src0, i32:$src1,
@@ -995,6 +1025,7 @@ class MAIInst<string OpName, VOPProfile P, SDPatternOperator node, bit Scaled =
Instruction Opcode = !cast<Instruction>(NAME);
bit is_dgemm = 0;
bit is_gfx940_xdl = 0;
+ let isConvergent = 1;
let PseudoInstr = NAME; // FIXME: Why is this not the default
}
@@ -1032,7 +1063,7 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag,
defvar ProfileVGPR = !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD");
- let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
+ let mayRaiseFPException = 0, ReadsModeReg = 1 in {
// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in {
def _e64 : MAIInst<OpName, ProfileAGPR,
@@ -1059,7 +1090,7 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag,
MFMATable<1, "VGPR", NAME # "_vgprcd_e64", NAME # "_mac_e64">;
}
}
- } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
+  } // End mayRaiseFPException = 0, ReadsModeReg = 1
}
// Provide a wrapper around MAIInst that provides the appended operands from V_MFMA_LD_SCALE_B32
@@ -1363,16 +1394,10 @@ class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> {
field bit is_wmma_xdl;
}
-def WMMAOpcode : GenericEnum {
- let FilterClass = "VOP3P_Pseudo";
-}
-
class WMMAMappingTable : GenericTable {
let FilterClass = "WMMAOpcodeMapping";
let CppTypeName = "WMMAOpcodeMappingInfo";
let Fields = ["Opcode2Addr", "Opcode3Addr"];
- string TypeOf_Opcode2Addr = "WMMAOpcode";
- string TypeOf_Opcode3Addr = "WMMAOpcode";
}
def WMMAOpcode2AddrMappingTable : WMMAMappingTable {
@@ -1401,13 +1426,13 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>;
- let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+ let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in {
def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
}
}
if convertibleTo3Addr then {
- let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+ let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
}
@@ -1453,13 +1478,12 @@ let WaveSizePredicate = isWave64 in {
}
class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
- bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0,
+ bit _IsIU, bit _IsFP8BF8, bit _Has_ImodOp = 0,
bit _HasMatrixFMT = 0, bit _HasMatrixScale = 0,
bit _Scale16 = 0, bit _HasMatrixReuse = 0, bit _IsF4 = 0>
: VOP3P_Profile<VOPProfile<ArgTy>> {
bit IsIU = _IsIU;
- bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B
- bit IsXF32 = !and(_IsFP8BF8XF32, !eq(ArgTy[1], v8f32));
+ bit NoABMods = !or(_IsFP8BF8, _IsF4); // No IMOD support for A and B
int IndexType = _IndexType;
let HasMatrixFMT = _HasMatrixFMT;
@@ -1468,7 +1492,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
let HasMatrixReuse = _HasMatrixReuse;
bit HasIModOp = _Has_ImodOp;
- let HasClamp = !and(IsIU, !not(HasIModOp));
+ let HasClamp = IsIU;
let IsPacked = 1;
let IsWMMA = !not(_IsSWMMAC);
let IsSWMMAC = _IsSWMMAC;
@@ -1487,9 +1511,9 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16));
bit NegLo01 = !not(NoABMods);
- bit NegLo2 = !and(!not(IsIU), !not(IsXF32), IsWMMA);
+ bit NegLo2 = !and(!not(IsIU), IsWMMA);
bit NegHi01 = IsF16BF16; // Only F16BF16 can have neg_hi[0:1]
- bit NegHi2 = !and(!not(IsIU), !not(IsXF32), IsWMMA);
+ bit NegHi2 = !and(!not(IsIU), IsWMMA);
bit NegLoAny = !or(NegLo01, NegLo2);
bit NegHiAny = !or(NegHi01, NegHi2);
@@ -1520,8 +1544,6 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
// wmma f32_f32 | neg_lo for neg A/B | neg_lo = 1 neg C(f32)
// | neg_hi ignored | neg_hi = 1 abs C(f32)
// ---------------------------------------------------------------------------
- // wmma f32_xf32 | not allowed for xf32 | not allowed
- // ---------------------------------------------------------------------------
// wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32)
// wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32)
// ---------------------------------------------------------------------------
@@ -1552,13 +1574,13 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
// pseudo
- // fp8bf8 and xf32 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16
+  // fp8bf8 wmmas don't use src (0 and 1) modifiers, iu wmmas use neg_lo, f16 and bf16
// use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers,
// remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32
// f16 or bf16). swmmac use index_key and don't use src 2 modifiers.
dag Src0Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src0_modifiers));
dag Src1Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src1_modifiers));
- dag Src2Mods = !if(!or(IsIU, IsXF32, IsSWMMAC), (ins), (ins PackedF16InputMods:$src2_modifiers));
+ dag Src2Mods = !if(!or(IsIU, IsSWMMAC), (ins), (ins PackedF16InputMods:$src2_modifiers));
dag IndexKey = !cond(!eq(IndexType, 0) : (ins),
!eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit),
!eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit),
@@ -1573,7 +1595,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt),
(ins));
dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins));
- dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins));
+ dag ClampOp = !if(HasClamp, (ins Clamp:$clamp), (ins));
dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi),
!and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo),
!and(!not(NegLoAny), !not(NegHiAny)) : (ins));
@@ -1585,7 +1607,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
(ins VRegSrc_64:$src2),
(ins VRegSrc_32:$src2)),
IndexKey)),
- MatrixScaleSrc, MatrixFMT, MatrixScale, MatrixReuse, Clamp, Neg);
+ MatrixScaleSrc, ClampOp, MatrixFMT, MatrixScale, MatrixReuse, Neg);
// asm
@@ -1635,22 +1657,21 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
IsAB_BF16_IMod0 : (ins (i32 8), Src1VT:$src1),
IsIU : (ins (VOP3PModsNeg $src1_modifiers), Src1VT:$src1),
NoABMods : (ins Src1VT:$src1));
- bit IsC_IMod1 = !and(HasIModOp, IsWMMA, !not(IsIU), !not(IsXF32));
+ bit IsC_IMod1 = !and(HasIModOp, IsWMMA, !not(IsIU));
bit IsC_F32_IMod0 = !and(IsC_F32, !not(HasIModOp));
bit IsC_F16_IMod0 = !and(IsC_F16, !not(HasIModOp));
bit IsC_BF16_IMod0 = !and(IsC_BF16, !not(HasIModOp));
- bit IsIUXF32 = !or(IsIU, IsXF32);
dag Src2InPatWmma = !cond(IsC_IMod1 : (ins timm:$src2_modifiers, Src2VT:$src2),
IsC_F32_IMod0 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))),
IsC_F16_IMod0 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))),
IsC_BF16_IMod0 : (ins Src2VT:$src2),
- IsIUXF32 : (ins Src2VT:$src2),
+ IsIU : (ins Src2VT:$src2),
IsSWMMAC : (ins));
dag Src2OutPatWmma = !cond(IsC_IMod1 : (ins (VOP3PModsNegAbs $src2_modifiers), Src2VT:$src2),
IsC_F32_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2),
IsC_F16_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2),
IsC_BF16_IMod0 : (ins (i32 8), Src2VT:$src2),
- IsIUXF32 : (ins Src2VT:$src2),
+ IsIU : (ins Src2VT:$src2),
IsSWMMAC : (ins));
dag ClampPat = !if(HasClamp, (ins i1:$clamp), (ins));
dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
@@ -1663,7 +1684,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
!eq(IndexType, 32): (ins i64:$src2, i32:$index_key_32bit));
dag MatrixFMTOutPat = !if(HasMatrixFMT, (ins i32:$matrix_a_fmt, i32:$matrix_b_fmt), (ins));
dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins timm:$src2_modifiers), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2))));
- dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins (VOP3PModsNegAbs $src2_modifiers)), (ins (i32 8)))), (ins Src2VT:$src2));
+ dag Src2InlineOutPat = !con(!if(IsIU, (ins), !if(IsC_IMod1, (ins (VOP3PModsNegAbs $src2_modifiers)), (ins (i32 8)))), (ins Src2VT:$src2));
dag MatrixScaleInPat = !if(HasMatrixScale, (ins timm:$matrix_a_scale, timm:$matrix_a_scale_fmt, ScaleTy:$scale_src0,
timm:$matrix_b_scale, timm:$matrix_b_scale_fmt, ScaleTy:$scale_src1),
(ins));
@@ -1674,17 +1695,17 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins));
dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixScaleInPat, MatrixReuseInPat, ClampPat);
- dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixScaleOutSrcPat, MatrixFMTOutPat,
- MatrixScaleOutModPat, MatrixReuseOutModPat, ClampPat);
+ dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixScaleOutSrcPat, ClampPat, MatrixFMTOutPat,
+ MatrixScaleOutModPat, MatrixReuseOutModPat);
dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat);
- dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat);
+ dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat, MatrixReuseOutModPat);
// wmma pattern where src2 is inline imm uses _threeaddr pseudo,
// can't use _twoaddr since it would violate src2 tied to vdst constraint.
dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixScaleInPat, MatrixReuseInPat, ClampPat);
- dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixScaleOutSrcPat,
- MatrixFMTOutPat, MatrixScaleOutModPat, MatrixReuseOutModPat, ClampPat);
+ dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixScaleOutSrcPat, ClampPat,
+ MatrixFMTOutPat, MatrixScaleOutModPat, MatrixReuseOutModPat);
}
def WMMAInstInfoTable : GenericTable {
@@ -1706,7 +1727,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse
defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2");
defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
- let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+ let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in {
let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in
def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo {
let PseudoInstr = Instr#PseudoInstrSuffix;
@@ -1733,7 +1754,7 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P
let mayRaiseFPException = 0;
let ReadsModeReg = 0;
let AsmMatchConverter = "cvtSWMMAC";
-
+ let isConvergent = 1;
let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef";
}
}
@@ -1756,84 +1777,126 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P
// Non-standard types (iu8, iu4, fp8, bf8) will be packed in vectors of i32s.
// Original type for them is in comment on the right and refers to A and B.
-def F32_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v8f16, v8f32], 0, 0, 0, 0>;
-def F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v8i16, v8f32], 0, 0, 0, 0>;
-def F16_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v8f16, v8f16], 0, 0, 0, 0>;
-def BF16_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v8i16, v8i16], 0, 0, 0, 0>;
-def I32_IU8_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 8xi8
-def I32_IU4X16_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, i32, i32, v8i32], 0, 0, 1, 0>; // 8xi4
-def F32_FP8BF8_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v2i32, v8f32], 0, 0, 0, 1>; // 8xf8
-def I32_IU4X32_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 16xi4
-
-def F32_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v4f16, v4f32], 0, 0, 0, 0>;
-def F32_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v4i16, v4f32], 0, 0, 0, 0>;
-def F16_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v4f16, v4f16], 0, 0, 0, 0>;
-def BF16_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v4i16, v4i16], 0, 0, 0, 0>;
-def I32_IU8_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 4xi8
-def I32_IU4X16_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 *
-def F32_FP8BF8_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, i32, i32, v4f32], 0, 0, 0, 1>; // 4xf8
-def I32_IU4X32_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4
-
-def F32_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v16f16, v8f32], 1, 16, 0, 0>;
-def F32_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v16i16, v8f32], 1, 16, 0, 0>;
-def F16_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v16f16, v8f16], 1, 16, 0, 0>;
-def BF16_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v16i16, v8i16], 1, 16, 0, 0>;
-def I32_IU8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 16, 1, 0>; // 8xi8, 16xi8
-def I32_IU4X32_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, i32, v2i32, v8i32], 1, 16, 1, 0>; // 8xi4, 16xi4
-def I32_IU4X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 0, 1, 0>; // 16xi4, 32xi4 **
-def F32_FP8BF8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v4i32, v8f32], 1, 16, 0, 1>; // 8xf8, 16xf8
-
-def F32_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v8f16, v4f32], 1, 8, 0, 0>;
-def F32_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v8i16, v4f32], 1, 8, 0, 0>;
-def F16_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v8f16, v4f16], 1, 8, 0, 0>;
-def BF16_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v8i16, v4i16], 1, 8, 0, 0>;
-def I32_IU8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 8, 1, 0>; // 4xi8, 8xi8
-def I32_IU4X32_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 1, 16, 1, 0>; // 8xi4, 8xi4 ***
-def I32_IU4X64_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 16, 1, 0>; // 8xi4, 16xi4
-def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, 8, 0, 1>; // 4xf8, 8xf8
+def F32_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v8f16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>;
+def F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v8i16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>;
+def F16_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v8f16, v8f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>;
+def BF16_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v8i16, v8i16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>;
+def I32_IU8_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi8
+def I32_IU4X16_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, i32, i32, v8i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4
+def F32_FP8BF8_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v2i32, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1>; // 8xf8
+def I32_IU4X32_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 16xi4
+
+def F32_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v4f16, v4f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>;
+def F32_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v4i16, v4f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>;
+def F16_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v4f16, v4f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>;
+def BF16_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v4i16, v4i16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>;
+def I32_IU8_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 4xi8
+def I32_IU4X16_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4 *
+def F32_FP8BF8_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, i32, i32, v4f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1>; // 4xf8
+def I32_IU4X32_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4
+
+def F32_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v16f16, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0>;
+def F32_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v16i16, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0>;
+def F16_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v16f16, v8f16], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0>;
+def BF16_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v16i16, v8i16], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0>;
+def I32_IU8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi8, 16xi8
+def I32_IU4X32_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, i32, v2i32, v8i32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4, 16xi4
+def I32_IU4X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], /*_IsSWMMAC=*/1, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 16xi4, 32xi4 **
+def F32_FP8BF8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v4i32, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/1>; // 8xf8, 16xf8
+
+def F32_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v8f16, v4f32], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/0, /*_IsFP8BF8=*/0>;
+def F32_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v8i16, v4f32], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/0, /*_IsFP8BF8=*/0>;
+def F16_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v8f16, v4f16], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/0, /*_IsFP8BF8=*/0>;
+def BF16_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v8i16, v4i16], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/0, /*_IsFP8BF8=*/0>;
+def I32_IU8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 4xi8, 8xi8
+def I32_IU4X32_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4, 8xi4 ***
+def I32_IU4X64_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4, 16xi4
+def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/0, /*_IsFP8BF8=*/1>; // 4xf8, 8xf8
// * IU4X16_WMMA_w64 lanes 0-31 will have 8xi4, remaining lanes are ignored
// ** IU4X64_SWMMAC_w32 index is i32, index_key is not used
// *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored
// for matrix A, index is i16; Matrix B uses all lanes
-def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>;
-def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>;
-def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>;
-def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 0, 0, 0, 1>;
-def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 0, 0, 0, 1>;
-def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>;
-def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 0, 0, 0, 1>;
-def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 0, 0, 0, 1>;
-def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>;
-def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>;
-def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 0, 0, 1>;
-def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 0, 0, 1>;
-def F32_32X16X128_F4_SCALE_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 0, 1>;
-def F32_32X16X128_F4_SCALE16_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 1, 1>;
-def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
-def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
-def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
-def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
-def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 0, 0, 0, 1>;
-def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 0, 0, 0, 1>;
-def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 0, 0, 0, 1>;
-
-multiclass WMMA_F8F6F4_Profiles<bit HasMatrixScale, bit Scale16, bit HasMatrixReuse> {
- def _f8_f8_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
- def _f8_f6_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
- def _f8_f4_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
- def _f6_f8_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
- def _f6_f6_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
- def _f6_f4_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
- def _f4_f8_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
- def _f4_f6_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
- def _f4_f4_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
-}
-
-defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0, 0, 0>;
-defm F32_16X16X128_F8F6F4_SCALE : WMMA_F8F6F4_Profiles<1, 0, 1>;
-defm F32_16X16X128_F8F6F4_SCALE16 : WMMA_F8F6F4_Profiles<1, 1, 1>;
+def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/0, /*_IsF4=*/1>;
+def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def F32_32X16X128_F4_SCALE_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/1, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def F32_32X16X128_F4_SCALE16_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/1, /*_Scale16=*/1, /*_HasMatrixReuse=*/1>;
+def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/32, /*_IsIU=*/0, /*_IsFP8BF8=*/1,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], /*_IsSWMMAC=*/1, /*_IndexType=*/32, /*_IsIU=*/0, /*_IsFP8BF8=*/1,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], /*_IsSWMMAC=*/1, /*_IndexType=*/32, /*_IsIU=*/1, /*_IsFP8BF8=*/0,
+ /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
+
+// Helper class to compute the destination vector type of WMMA_F8F6F4
+// instructions based on element type and dimensions.
+class getWMMAF8F6F4DstVTy<ValueType DstEltTy, int M, int N> {
+ // Size in bits = (M * N / 32) * element_size_in_bits
+ defvar Size = !mul(!div(!mul(M, N), 32), DstEltTy.Size);
+ ValueType ret = !cond(!eq(Size, 256) : v8f32,
+ !eq(Size, 1024) : v64f16);
+}
+
+// Helper class to compute the types of matrices A and B of WMMA_F8F6F4
+// instructions based on format and dimensions.
+class getWMMAF8F6F4ABVTy<string Fmt, int D1, int D2> {
+ defvar FmtBits = !cond(!eq(Fmt, "f8") : 8,
+ !eq(Fmt, "f6") : 6,
+ !eq(Fmt, "f4") : 4);
+ // TypeSize in bits = (D1 * D2 / 32) * format_bits
+ defvar TypeSize = !mul(!div(!mul(D1, D2), 32), FmtBits);
+ ValueType ret = !cond(!eq(TypeSize, 256) : v8i32,
+ !eq(TypeSize, 384) : v12i32,
+ !eq(TypeSize, 512) : v16i32,
+ !eq(TypeSize, 1024) : v32i32);
+}
+
+multiclass WMMA_F8F6F4_Profiles<ValueType DstEltTy, int M, int N, int K,
+ bit HasMatrixScale, bit Scale16, bit HasMatrixReuse> {
+ defvar DstTy = getWMMAF8F6F4DstVTy<DstEltTy, M, N>.ret;
+ foreach ATy = ["f8", "f6", "f4"] in {
+ foreach BTy = ["f8", "f6", "f4"] in {
+ def _#ATy#_#BTy#_w32 : VOP3PWMMA_Profile<
+ [DstTy, getWMMAF8F6F4ABVTy<ATy, M, K>.ret, getWMMAF8F6F4ABVTy<BTy, K, N>.ret, DstTy],
+        /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1,
+        /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/1, HasMatrixScale, Scale16, HasMatrixReuse>;
+ }
+ }
+}
+
+defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<f32, /*M=*/16, /*N=*/16, /*K=*/128, /*HasMatrixScale=*/0, /*Scale16=*/0, /*HasMatrixReuse=*/0>;
+defm F32_16X16X128_F8F6F4_SCALE : WMMA_F8F6F4_Profiles<f32, /*M=*/16, /*N=*/16, /*K=*/128, /*HasMatrixScale=*/1, /*Scale16=*/0, /*HasMatrixReuse=*/1>;
+defm F32_16X16X128_F8F6F4_SCALE16 : WMMA_F8F6F4_Profiles<f32, /*M=*/16, /*N=*/16, /*K=*/128, /*HasMatrixScale=*/1, /*Scale16=*/1, /*HasMatrixReuse=*/1>;
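
As a sanity check, the helper arithmetic reproduces the hand-written
profiles these defms replace (M = N = 16, K = 128, f32 destination):

  // Dst:      (16*16/32)  * 32 bits = 256 -> v8f32
  // A/B "f8": (16*128/32) * 8 bits  = 512 -> v16i32
  // A/B "f6": (16*128/32) * 6 bits  = 384 -> v12i32
  // A/B "f4": (16*128/32) * 4 bits  = 256 -> v8i32

so e.g. _f8_f6_w32 again gets [v8f32, v16i32, v12i32, v8f32], exactly the
types the deleted F32_16X16X128_F8F6F4 definitions spelled out.
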
class VOP_WMMA_LD_SCALE<ValueType vt, RegisterOperand RC> : VOP3P_Profile<VOPProfile<[untyped, vt, vt, untyped]>> {
let HasMatrixScale = 1;
@@ -1905,8 +1968,10 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16
defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">;
} // End is_wmma_xdl = 1.
-defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>;
-defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>;
+let isConvergent = 1 in {
+ defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>;
+ defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>;
+}
} // End SubtargetPredicate = isGFX125xOnly
} // End WaveSizePredicate = isWave32
@@ -2182,20 +2247,23 @@ multiclass VOP3P_Real_WMMA_gfx1250 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
}
}
-multiclass VOP3P_Real_WMMA_F8F6F4_gfx1250<bits<8> op, VOP3PWMMA_Profile WMMAP> {
+multiclass VOP3P_Real_WMMA_F8F6F4<string Gen, bits<8> op, VOP3PWMMA_Profile WMMAP> {
defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
defvar psName = !substr(NAME, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
- let AsmString = asmName # PS.AsmOperands in
- defm NAME : VOP3P_Real_WMMA_gfx1250<op, WMMAP>,
- MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_twoaddr_gfx1250">;
+ let AsmString = asmName # PS.AsmOperands in {
+ if !eq(Gen, "gfx1250") then {
+ defm NAME : VOP3P_Real_WMMA_gfx1250<op, WMMAP>,
+ MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_twoaddr_" # Gen>;
+ }
+ }
}
-multiclass VOP3P_Real_WMMA_gfx1250_SrcFormats<bits<8> op, string WMMAP> {
- defm _f8_f8_w32 : VOP3P_Real_WMMA_F8F6F4_gfx1250<op, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>;
+multiclass VOP3P_Real_WMMA_SrcFormats<string Gen, bits<8> op, string WMMAP> {
+ defm _f8_f8_w32 : VOP3P_Real_WMMA_F8F6F4<Gen, op, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>;
foreach I = ["f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
let isAsmParserOnly = true in { // Disable ambiguous disassembly.
- defm _#I#_w32 : VOP3P_Real_WMMA_F8F6F4_gfx1250<op, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>;
+ defm _#I#_w32 : VOP3P_Real_WMMA_F8F6F4<Gen, op, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>;
}
}
}
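
Parameterizing these Real multiclasses on a Gen string means a future target
only needs another guarded arm rather than a parallel multiclass; a purely
hypothetical sketch (gfx1300 and its Real multiclass are invented for
illustration):

  if !eq(Gen, "gfx1300") then {
    defm NAME : VOP3P_Real_WMMA_gfx1300<op, WMMAP>,
                MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_twoaddr_" # Gen>;
  }
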
@@ -2215,7 +2283,7 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO
let Inst{23-16} = LdScaleOp;
let Inst{40-32} = scale_src0;
let Inst{49-41} = scale_src1;
- let Inst{58-50} = 0; // scale src2
+ let Inst{58-50} = ?; // scale src2
let Inst{59} = matrix_b_scale{0}; // scale_op_sel_hi(0)
let Inst{60} = 0; // scale_op_sel_hi(1)
let Inst{63-61} = {0, matrix_a_scale_fmt{1-0}}; // neg (lo)
@@ -2234,9 +2302,9 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO
let Inst{87-80} = op;
let Inst{95-88} = 0xcc; //encoding
- let Inst{104-96} = !if(P.HasSrc0, src0, 0);
- let Inst{113-105} = !if(P.HasSrc1, src1, 0);
- let Inst{122-114} = !if(P.HasSrc2, src2, 0);
+ let Inst{104-96} = !if(P.HasSrc0, src0, ?);
+ let Inst{113-105} = !if(P.HasSrc1, src1, ?);
+ let Inst{122-114} = !if(P.HasSrc2, src2, ?);
// neg_lo
let Inst{125} = !if(P.NegLo01, src0_modifiers{0}, 0);
@@ -2244,34 +2312,35 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO
let Inst{127} = !if(P.NegLo2, src2_modifiers{0}, 0);
}
-multiclass VOP3PX2_Real_ScaledWMMA_F4<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> {
- defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
- let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32,
- DecoderNamespace = "GFX1250" in {
+multiclass VOP3PX2_Real_ScaledWMMA_F4<string Gen, bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> {
+ defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
+ if !eq(Gen, "gfx1250") then {
def _gfx1250 : VOP3P_Real_Gen<PS, GFX1250Gen, PS.Mnemonic>,
- VOP3PX2e <op, LdScaleOp, WMMAP>;
+ VOP3PX2e <op, LdScaleOp, WMMAP> {
+ let PostEncoderMethod = "postEncodeVOP3<true, true, false>";
+ }
}
}
-multiclass VOP3PX2_Real_ScaledWMMA<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> {
+multiclass VOP3PX2_Real_ScaledWMMA<string Gen, bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> {
defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
defvar psName = !substr(NAME, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
- let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32,
- DecoderNamespace = "GFX1250" in {
+ if !eq(Gen, "gfx1250") then {
def _gfx1250 : VOP3P_Real_Gen<PS, GFX1250Gen, asmName>,
VOP3PX2e <op, LdScaleOp, WMMAP>,
- MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_gfx1250"> {
+ MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_" # Gen> {
let AsmString = asmName # PS.AsmOperands;
+ let PostEncoderMethod = "postEncodeVOP3<true, true, false>";
}
}
}
-multiclass VOP3PX2_Real_ScaledWMMA_SrcFormats<bits<8> op, bits<8> LdScaleOp, string WMMAP> {
- defm _f8_f8_w32 : VOP3PX2_Real_ScaledWMMA<op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>;
+multiclass VOP3PX2_Real_ScaledWMMA_SrcFormats<string Gen, bits<8> op, bits<8> LdScaleOp, string WMMAP> {
+ defm _f8_f8_w32 : VOP3PX2_Real_ScaledWMMA<Gen, op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>;
foreach I = ["f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
let isAsmParserOnly = true in { // Disable ambiguous disassembly.
- defm _#I#_w32 : VOP3PX2_Real_ScaledWMMA<op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>;
+ defm _#I#_w32 : VOP3PX2_Real_ScaledWMMA<Gen, op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>;
}
}
}
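
Both _SrcFormats wrappers stamp out one real instruction per source-format pair with a single foreach, building each record suffix by name pasting (_#I#_w32); everything except the canonical f8_f8 variant is isAsmParserOnly, so the disassembler sees exactly one encoding. Reduced to a standalone sketch (hypothetical names):

class FmtRec<string fmt> {
  string Format = fmt;
}
multiclass PerFormat {
  def _f8_f8 : FmtRec<"f8_f8">;  // canonical, disassemblable variant
  foreach I = ["f8_f6", "f6_f8"] in
    def _#I : FmtRec<I>;         // parser-only aliases in the real patch
}
defm V_EX : PerFormat;  // V_EX_f8_f8, V_EX_f8_f6, V_EX_f6_f8
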
@@ -2350,12 +2419,14 @@ defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x086, F16_FP8B
defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8BF8X128_WMMA_w32>;
defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>;
-defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x033, "F32_16X16X128_F8F6F4">;
-defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x35, "F32_16X16X128_F8F6F4_SCALE">;
-defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x3a, "F32_16X16X128_F8F6F4_SCALE16">;
+let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus, DecoderNamespace = "GFX1250" in {
+defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_SrcFormats <"gfx1250", 0x033, "F32_16X16X128_F8F6F4">;
+defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats <"gfx1250", 0x033, 0x35, "F32_16X16X128_F8F6F4_SCALE">;
+defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats <"gfx1250", 0x033, 0x3a, "F32_16X16X128_F8F6F4_SCALE16">;
-defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x35, F32_32X16X128_F4_SCALE_w32>;
-defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x3a, F32_32X16X128_F4_SCALE16_w32>;
+defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4 <"gfx1250", 0x088, 0x35, F32_32X16X128_F4_SCALE_w32>;
+defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4 <"gfx1250", 0x088, 0x3a, F32_32X16X128_F4_SCALE16_w32>;
+} // End WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus, DecoderNamespace = "GFX1250"
defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>;
@@ -2417,6 +2488,11 @@ multiclass VOP3P_Realtriple<GFXGen Gen, bits<8> op, string backing_ps_name = NAM
multiclass VOP3P_Realtriple_gfx11_gfx12<bits<8> op>
: VOP3P_Realtriple<GFX11Gen, op>, VOP3P_Realtriple<GFX12Gen, op>;
+defm V_DOT4_F32_FP8_BF8 : VOP3P_Realtriple_gfx11_gfx12<0x24>;
+defm V_DOT4_F32_BF8_FP8 : VOP3P_Realtriple_gfx11_gfx12<0x25>;
+defm V_DOT4_F32_FP8_FP8 : VOP3P_Realtriple_gfx11_gfx12<0x26>;
+defm V_DOT4_F32_BF8_BF8 : VOP3P_Realtriple_gfx11_gfx12<0x27>;
+
//===----------------------------------------------------------------------===//
// GFX12
//===----------------------------------------------------------------------===//
@@ -2459,8 +2535,10 @@ defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>;
defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>;
defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>;
-defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_gfx1250<0x35>;
-defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_gfx1250<0x3a>;
+let PostEncoderMethod = "postEncodeVOP3<true, true, false>" in {
+ defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_gfx1250<0x35>;
+ defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_gfx1250<0x3a>;
+}
let AssemblerPredicate = isGFX1250Plus in
def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">;
@@ -2468,10 +2546,6 @@ def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">;
defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;
-defm V_DOT4_F32_FP8_BF8 : VOP3P_Realtriple<GFX12Gen, 0x24>;
-defm V_DOT4_F32_BF8_FP8 : VOP3P_Realtriple<GFX12Gen, 0x25>;
-defm V_DOT4_F32_FP8_FP8 : VOP3P_Realtriple<GFX12Gen, 0x26>;
-defm V_DOT4_F32_BF8_BF8 : VOP3P_Realtriple<GFX12Gen, 0x27>;
//===----------------------------------------------------------------------===//
// GFX11
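
The V_DOT4_F32_* dot products move from a GFX12-only VOP3P_Realtriple instantiation to VOP3P_Realtriple_gfx11_gfx12, which inherits one instantiation per generation so a single defm realizes both encodings. A standalone sketch of that multi-generation wrapper (hypothetical names):

class GenRecord<string gen, bits<8> op> {
  string Generation = gen;
  bits<8> Opcode = op;
}
multiclass RealPerGen<string Gen, bits<8> op> {
  def _#Gen : GenRecord<Gen, op>;
}
multiclass Real_gfx11_gfx12<bits<8> op>
  : RealPerGen<"gfx11", op>, RealPerGen<"gfx12", op>;
defm V_DOT4_EXAMPLE : Real_gfx11_gfx12<0x24>;
// Produces V_DOT4_EXAMPLE_gfx11 and V_DOT4_EXAMPLE_gfx12.
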
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 2730ec5..989181b 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -24,7 +24,7 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
bits<8> src1;
let Inst{8-0} = 0xf9; // sdwa
- let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
+ let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, ?);
let Inst{24-17} = op;
let Inst{31-25} = 0x3e; // encoding
}
@@ -33,10 +33,10 @@ class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
bits<9> src1;
let Inst{8-0} = 0xf9; // sdwa
- let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
+ let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, ?);
let Inst{24-17} = op;
let Inst{31-25} = 0x3e; // encoding
- let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr
+ let Inst{63} = !if(P.HasSrc1, src1{8}, ?); // src1_sgpr
}
@@ -422,7 +422,6 @@ multiclass VOPC_Pseudos <string opName,
}
-let SubtargetPredicate = HasSdstCMPX in {
multiclass VOPCX_Pseudos <string opName,
VOPC_Profile P, VOPC_Profile P_NoSDst,
SDPatternOperator cond = COND_NULL,
@@ -486,7 +485,6 @@ multiclass VOPCX_Pseudos <string opName,
}
} // end SubtargetPredicate = isGFX11Plus
}
-} // End SubtargetPredicate = HasSdstCMPX
defm VOPC_I1_F16_F16 : VOPC_Profile_t16<[Write32Bit], f16>;
def VOPC_I1_F32_F32 : VOPC_Profile<[Write32Bit], f32>;
@@ -518,8 +516,10 @@ multiclass VOPC_F16 <string opName, SDPatternOperator cond = COND_NULL,
multiclass VOPC_F32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_F32_F32, cond, revOp, 0>;
-multiclass VOPC_F64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_F64_F64, cond, revOp, 0>;
+let IsDPMACCInstruction = 1 in {
+ multiclass VOPC_F64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_F64_F64, cond, revOp, 0>;
+}
multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL,
string revOp = opName> {
@@ -537,9 +537,10 @@ multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL,
multiclass VOPC_I32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>;
-multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
-
+let IsDPMACCInstruction = 1 in {
+ multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
+}
multiclass VOPCX_F16<string opName, string revOp = opName> {
let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
@@ -556,8 +557,10 @@ multiclass VOPCX_F16<string opName, string revOp = opName> {
multiclass VOPCX_F32 <string opName, string revOp = opName> :
VOPCX_Pseudos <opName, VOPC_I1_F32_F32, VOPC_F32_F32, COND_NULL, revOp>;
-multiclass VOPCX_F64 <string opName, string revOp = opName> :
- VOPCX_Pseudos <opName, VOPC_I1_F64_F64, VOPC_F64_F64, COND_NULL, revOp>;
+let IsDPMACCInstruction = 1 in {
+ multiclass VOPCX_F64 <string opName, string revOp = opName> :
+ VOPCX_Pseudos <opName, VOPC_I1_F64_F64, VOPC_F64_F64, COND_NULL, revOp>;
+}
multiclass VOPCX_I16<string opName, string revOp = opName> {
let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
@@ -574,8 +577,10 @@ multiclass VOPCX_I16<string opName, string revOp = opName> {
multiclass VOPCX_I32 <string opName, string revOp = opName> :
VOPCX_Pseudos <opName, VOPC_I1_I32_I32, VOPC_I32_I32, COND_NULL, revOp>;
-multiclass VOPCX_I64 <string opName, string revOp = opName> :
- VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>;
+let IsDPMACCInstruction = 1 in {
+ multiclass VOPCX_I64 <string opName, string revOp = opName> :
+ VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>;
+}
//===----------------------------------------------------------------------===//
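
Each 64-bit compare family above is wrapped in let IsDPMACCInstruction = 1. A top-level let recorded around a multiclass definition is applied to every record the multiclass later instantiates, so all the F64/I64 VOPC pseudos pick up the flag without any change to their bodies. Standalone sketch (the Flagged class is hypothetical; the field name is the one this patch introduces):

class Flagged {
  bit IsDPMACCInstruction = 0;
}
let IsDPMACCInstruction = 1 in {
  multiclass MakeCmp64 {
    def _e64 : Flagged;  // picks up the flag from the enclosing let
  }
}
defm V_CMP_EXAMPLE : MakeCmp64;  // V_CMP_EXAMPLE_e64 has the bit set
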
@@ -1114,7 +1119,6 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec,
} // end SubtargetPredicate = isGFX11Plus
}
-let SubtargetPredicate = HasSdstCMPX in {
multiclass VOPCX_Class_Pseudos <string opName,
VOPC_Profile P,
VOPC_Profile P_NoSDst> :
@@ -1164,7 +1168,6 @@ multiclass VOPCX_Class_Pseudos <string opName,
}
} // end SubtargetPredicate = isGFX11Plus
}
-} // End SubtargetPredicate = HasSdstCMPX
} // End ReadsModeReg = 0, mayRaiseFPException = 0
defm VOPC_I1_F16_I16 : VOPC_Class_Profile_t16<[Write32Bit]>;
@@ -1210,11 +1213,13 @@ multiclass VOPC_CLASS_F32 <string opName> {
multiclass VOPCX_CLASS_F32 <string opName> :
VOPCX_Class_Pseudos <opName, VOPC_I1_F32_I32, VOPC_F32_I32>;
+// FIXME: let IsDPMACCInstruction = 1 in
multiclass VOPC_CLASS_F64 <string opName> {
defm NAME : VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 0>;
defm : VOPCClassPat64<NAME>;
}
+// FIXME: let IsDPMACCInstruction = 1 in
multiclass VOPCX_CLASS_F64 <string opName> :
VOPCX_Class_Pseudos <opName, VOPC_I1_F64_I32, VOPC_F64_I32>;
@@ -1233,18 +1238,12 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
// We need to use COPY_TO_REGCLASS to work around a problem where ReplaceAllUsesWith()
// complains it cannot replace i1 <-> i64/i32 if the node was not morphed in place.
multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt, dag dstInst = (inst $src0, $src1)> {
- let WaveSizePredicate = isWave64 in
def : GCNPat <
- (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
- (i64 (COPY_TO_REGCLASS dstInst, SReg_64))
+ (WaveSizeVT (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
+ dstInst
>;
let WaveSizePredicate = isWave32 in {
- def : GCNPat <
- (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
- (i32 (COPY_TO_REGCLASS dstInst, SReg_32))
- >;
-
// Support codegen of i64 setcc in wave32 mode.
def : GCNPat <
(i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
@@ -1459,7 +1458,7 @@ class VOPC_DPP_Base<bits<8> op, string OpName, VOPProfile P>
let Inst{8-0} = 0xfa;
- let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, 0);
+ let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, ?);
let Inst{48-40} = dpp_ctrl;
let Inst{50} = fi;
let Inst{51} = bound_ctrl;
@@ -1485,7 +1484,7 @@ class VOPC_DPP8_Base<bits<8> op, string OpName, VOPProfile P>
let Inst{8-0} = fi;
- let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, 0);
+ let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, ?);
let Inst{63-40} = dpp8{23-0};
let AsmMatchConverter = "cvtDPP8";
@@ -1535,6 +1534,8 @@ class VOPC64_DPP<VOP_DPP_Pseudo ps, string opName = ps.OpName>
let Uses = ps.Uses;
let OtherPredicates = ps.OtherPredicates;
let Constraints = ps.Constraints;
+
+ let PostEncoderMethod = !if(!empty(ps.Defs), "", "postEncodeVOPCX");
}
class VOPC64_DPP16_Dst<bits<10> op, VOP_DPP_Pseudo ps,
@@ -1575,6 +1576,8 @@ class VOPC64_DPP8<VOP_Pseudo ps, string opName = ps.OpName>
let Uses = ps.Uses;
let OtherPredicates = ps.OtherPredicates;
let True16Predicate = ps.True16Predicate;
+
+ let PostEncoderMethod = !if(!empty(ps.Defs), "", "postEncodeVOPCX");
}
class VOPC64_DPP8_Dst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
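
The VOPC64 DPP/DPP8 reals above attach a post-encoder only when the pseudo has implicit defs, which appears to single out the EXEC-writing VOPCX forms; plain compares with an empty Defs list keep an empty hook. The string-valued select, reduced to a standalone sketch with hypothetical names:

class Hooked<list<string> defs> {
  list<string> Defs = defs;
  // No hook when Defs is empty; otherwise route through postEncodeVOPCX.
  string PostEncoderMethod = !if(!empty(defs), "", "postEncodeVOPCX");
}
def PLAIN_CMP : Hooked<[]>;        // PostEncoderMethod = ""
def CMPX_LIKE : Hooked<["EXEC"]>;  // PostEncoderMethod = "postEncodeVOPCX"
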
@@ -1777,6 +1780,7 @@ multiclass VOPCX_Real<GFXGen Gen, bits<9> op> {
let Inst{7-0} = ?; // sdst
let AsmString = !subst("_nosdst", "", ps64.Mnemonic)
# "{_e64} " # ps64.AsmOperands;
+ let PostEncoderMethod = "postEncodeVOPCX";
}
defm : VOPCXInstAliases<NAME, !substr(Gen.Suffix, 1)>;
@@ -1838,6 +1842,7 @@ multiclass VOPCX_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
let Inst{7-0} = ?; // sdst
let Inst{14} = 0;
let AsmString = asm_name # "{_e64} " # ps64.AsmOperands;
+ let PostEncoderMethod = "postEncodeVOPCX";
}
} else {
def _e64#Gen.Suffix
@@ -1845,6 +1850,7 @@ multiclass VOPCX_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
let Inst{7-0} = ?; // sdst
let AsmString = asm_name # "{_e64} " # ps64.AsmOperands;
+ let PostEncoderMethod = "postEncodeVOPCX";
}
}
@@ -2186,6 +2192,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
let Inst{7-0} = ?; // sdst
let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic)
# "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands;
+ let PostEncoderMethod = "postEncodeVOPCX";
}
if !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9 then
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 8325c62..09fdb00 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -18,6 +18,7 @@ class LetDummies {
bit isConvergent;
bit isAsCheapAsAMove;
bit FPDPRounding;
+ bit IsDPMACCInstruction;
Predicate SubtargetPredicate;
string Constraints;
string DisableEncoding;
@@ -71,6 +72,7 @@ class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
string Mnemonic = opName;
Instruction Opcode = !cast<Instruction>(NAME);
bit IsTrue16 = P.IsTrue16;
+ bit IsDPMACCInstruction = 0;
VOPProfile Pfl = P;
string AsmOperands;
@@ -166,6 +168,7 @@ class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
class VOP_Real<VOP_Pseudo ps> {
Instruction Opcode = !cast<Instruction>(NAME);
bit IsSingle = ps.Pfl.IsSingle;
+ bit IsDPMACCInstruction = ps.IsDPMACCInstruction;
}
class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> :
@@ -198,6 +201,8 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni
let isConvergent = ps.isConvergent;
VOPProfile Pfl = ps.Pfl;
+
+ let PostEncoderMethod = !if(!and(Pfl.HasSrc0, Pfl.HasSrc1, Pfl.HasSrc2), "", "postEncodeVOP3<"#Pfl.HasSrc0#","#Pfl.HasSrc1#","#Pfl.HasSrc2#">");
}
class VOP3_Real_Gen <VOP_Pseudo ps, GFXGen Gen, string asm_name = ps.Mnemonic> :
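
VOP3_Real now derives its post-encoder from the profile: with all three sources present there is nothing to patch, otherwise the HasSrcN bits are pasted directly into the C++ template arguments (bits paste as 1/0, so a profile with src0 and src2 yields "postEncodeVOP3<1,0,1>"). Sketch with a hypothetical profile stand-in:

class WithVOP3Hook<bit s0, bit s1, bit s2> {
  string PostEncoderMethod =
      !if(!and(s0, s1, s2), "",
          "postEncodeVOP3<" # s0 # "," # s1 # "," # s2 # ">");
}
def FULL_SRCS : WithVOP3Hook<1, 1, 1>;  // "" - nothing to post-encode
def TWO_SRCS  : WithVOP3Hook<1, 0, 1>;  // "postEncodeVOP3<1,0,1>"
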
@@ -238,9 +243,9 @@ class VOP3a<VOPProfile P> : Enc64 {
let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
let Inst{31-26} = 0x34; //encoding
- let Inst{40-32} = !if(P.HasSrc0, src0, 0);
- let Inst{49-41} = !if(P.HasSrc1, src1, 0);
- let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+ let Inst{40-32} = !if(P.HasSrc0, src0, ?);
+ let Inst{49-41} = !if(P.HasSrc1, src1, ?);
+ let Inst{58-50} = !if(P.HasSrc2, src2, ?);
let Inst{60-59} = !if(P.HasOMod, omod, 0);
let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0);
let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0);
@@ -273,9 +278,9 @@ class VOP3a_t16<VOPProfile P> : Enc64 {
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
let Inst{31-26} = 0x35;
- let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, 0);
- let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0);
- let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0);
+ let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, ?);
+ let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, ?);
+ let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, ?);
let Inst{60-59} = !if(P.HasOMod, omod, 0);
let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0);
let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0);
@@ -457,9 +462,9 @@ class VOP3be <VOPProfile P> : Enc64 {
let Inst{7-0} = vdst;
let Inst{14-8} = sdst;
let Inst{31-26} = 0x34; //encoding
- let Inst{40-32} = !if(P.HasSrc0, src0, 0);
- let Inst{49-41} = !if(P.HasSrc1, src1, 0);
- let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+ let Inst{40-32} = !if(P.HasSrc0, src0, ?);
+ let Inst{49-41} = !if(P.HasSrc1, src1, ?);
+ let Inst{58-50} = !if(P.HasSrc2, src2, ?);
let Inst{60-59} = !if(P.HasOMod, omod, 0);
let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0);
let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0);
@@ -509,9 +514,9 @@ class VOP3Pe <VOPProfile P> : Enc64, VOP3Pe_Base {
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
- let Inst{40-32} = !if(P.HasSrc0, src0, 0);
- let Inst{49-41} = !if(P.HasSrc1, src1, 0);
- let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+ let Inst{40-32} = !if(P.HasSrc0, src0, ?);
+ let Inst{49-41} = !if(P.HasSrc1, src1, ?);
+ let Inst{58-50} = !if(P.HasSrc2, src2, ?);
let Inst{59} = !cond(!and(P.HasSrc0, P.HasOpSel) : src0_modifiers{3},
P.IsDOT : 1,
P.HasMatrixScale : matrix_b_scale{0},
@@ -546,12 +551,12 @@ class VOP3Pe_MAI <bits<7> op, VOPProfile P, bit acc_cd = 0> : Enc64, VOP3Pe_MAI_
let Inst{22-16} = op;
let Inst{31-23} = 0x1a7; //encoding
- let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, 0);
- let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0);
- let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+ let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, ?);
+ let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, ?);
+ let Inst{58-50} = !if(P.HasSrc2, src2, ?);
- let Inst{59} = !if(P.HasSrc0, src0{9}, 0); // acc(0)
- let Inst{60} = !if(P.HasSrc1, src1{9}, 0); // acc(1)
+ let Inst{59} = !if(P.HasSrc0, src0{9}, ?); // acc(0)
+ let Inst{60} = !if(P.HasSrc1, src1{9}, ?); // acc(1)
let Inst{63-61} = !if(P.HasSrc1, blgp, 0);
}
@@ -631,12 +636,12 @@ class VOP3PXe <bits<7> op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_
let Inst{86-80} = op;
let Inst{95-87} = 0x1a7; //encoding
- let Inst{104-96} = !if(MFMAPfl.HasSrc0, src0{8-0}, 0);
- let Inst{113-105} = !if(MFMAPfl.HasSrc1, src1{8-0}, 0);
- let Inst{122-114} = !if(MFMAPfl.HasSrc2, src2, 0);
+ let Inst{104-96} = !if(MFMAPfl.HasSrc0, src0{8-0}, ?);
+ let Inst{113-105} = !if(MFMAPfl.HasSrc1, src1{8-0}, ?);
+ let Inst{122-114} = !if(MFMAPfl.HasSrc2, src2, ?);
- let Inst{123} = !if(MFMAPfl.HasSrc0, src0{9}, 0); // acc(0)
- let Inst{124} = !if(MFMAPfl.HasSrc1, src1{9}, 0); // acc(1)
+ let Inst{123} = !if(MFMAPfl.HasSrc0, src0{9}, ?); // acc(0)
+ let Inst{124} = !if(MFMAPfl.HasSrc1, src1{9}, ?); // acc(1)
let Inst{127-125} = !if(MFMAPfl.HasSrc1, blgp, 0);
}
@@ -698,7 +703,7 @@ class VOP_SDWAe<VOPProfile P> : Enc64 {
bits<2> dst_unused;
bits<1> clamp;
- let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, ?);
let Inst{42-40} = !if(P.EmitDstSel, dst_sel{2-0}, ?);
let Inst{44-43} = !if(P.EmitDstSel, dst_unused{1-0}, ?);
let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
@@ -732,11 +737,11 @@ class VOP_SDWA9e<VOPProfile P> : Enc64 {
bits<5> src1_modifiers;
bits<1> src1_sgpr;
- let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, ?);
let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0);
let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{4}, 0);
let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
- let Inst{55} = !if(P.HasSrc0, src0{8}, 0);
+ let Inst{55} = !if(P.HasSrc0, src0{8}, ?);
let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, 0);
let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{4}, 0);
let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
@@ -765,16 +770,9 @@ class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> {
}
class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
- InstSI <P.OutsSDWA, P.InsSDWA, "", pattern>,
- VOP <opName>,
- SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE> {
-
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let UseNamedOperandTable = 1;
+ VOP_Pseudo <opName, "_sdwa", P, P.OutsSDWA, P.InsSDWA, "", pattern> {
- string Mnemonic = opName;
- string AsmOperands = P.AsmSDWA;
+ let AsmOperands = P.AsmSDWA;
string AsmOperands9 = P.AsmSDWA9;
let Size = 8;
@@ -794,8 +792,6 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
let AsmVariantName = !if(P.HasExtSDWA, AMDGPUAsmVariants.SDWA,
AMDGPUAsmVariants.Disable);
let DecoderNamespace = "GFX8";
-
- VOPProfile Pfl = P;
}
class VOP_SDWA8_Real <VOP_SDWA_Pseudo ps> :
@@ -889,7 +885,7 @@ class VOP_DPPe<VOPProfile P, bit IsDPP16=0> : Enc64 {
bits<4> row_mask;
bit fi;
- let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, ?);
let Inst{48-40} = dpp_ctrl;
let Inst{50} = !if(IsDPP16, fi, ?);
let Inst{51} = bound_ctrl;
@@ -954,8 +950,8 @@ class VOP3_DPPe_Common<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op, P>
bits<9> src2;
let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
- let Inst{49-41} = !if(P.HasSrc1, src1, 0);
- let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+ let Inst{49-41} = !if(P.HasSrc1, src1, ?);
+ let Inst{58-50} = !if(P.HasSrc2, src2, ?);
}
class VOP3_DPPe_Common_t16<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op, P> {
@@ -964,8 +960,8 @@ class VOP3_DPPe_Common_t16<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op
bits<11> src2;
let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
- let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0);
- let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0);
+ let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, ?);
+ let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, ?);
}
class VOP3P_DPPe_Common_Base<bits<8> op, VOPProfile P> : Enc96 {
@@ -998,8 +994,8 @@ class VOP3P_DPPe_Common<bits<8> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P
bits<9> src2;
let Inst{7-0} = vdst;
- let Inst{49-41} = !if(P.HasSrc1, src1, 0);
- let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+ let Inst{49-41} = !if(P.HasSrc1, src1, ?);
+ let Inst{58-50} = !if(P.HasSrc2, src2, ?);
}
class VOP3P_DPPe_Common_t16<bits<8> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P> {
@@ -1008,8 +1004,8 @@ class VOP3P_DPPe_Common_t16<bits<8> op, VOPProfile P> : VOP3P_DPPe_Common_Base<o
bits<11> src2;
let Inst{7-0} = vdst{7-0};
- let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0);
- let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0);
+ let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, ?);
+ let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, ?);
}
class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
@@ -1134,7 +1130,7 @@ class VOP3_DPP_Enc <bits<10> op, VOPProfile P, bit IsDPP16> :
VOP3_DPPe_Fields {
let Inst{40-32} = 0xfa;
- let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?);
let Inst{80-72} = dpp_ctrl;
let Inst{82} = !if(IsDPP16, fi, ?);
let Inst{83} = bound_ctrl;
@@ -1154,7 +1150,7 @@ class VOP3_DPP_Enc_t16<bits<10> op, VOPProfile P, bit IsDPP16 >
VOP3_DPPe_Fields_t16 {
let Inst{40-32} = 0xfa;
- let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?);
let Inst{80-72} = dpp_ctrl;
let Inst{82} = !if(IsDPP16, fi, ?);
let Inst{83} = bound_ctrl;
@@ -1180,7 +1176,7 @@ class VOP3P_DPP <bits<8> op, string OpName, VOPProfile P, bit IsDPP16,
let VOP3P = 1;
let Inst{40-32} = 0xfa;
- let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?);
let Inst{80-72} = dpp_ctrl;
let Inst{82} = !if(IsDPP16, fi, ?);
let Inst{83} = bound_ctrl;
@@ -1195,7 +1191,7 @@ class VOP_DPP8e<VOPProfile P> : Enc64 {
bits<24> dpp8;
bits<9> fi;
- let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, ?);
let Inst{63-40} = dpp8{23-0};
}
@@ -1246,7 +1242,7 @@ class VOP3_DPP8_Enc <bits<10> op, VOPProfile P> :
VOP3_DPPe_Common<op, P>,
VOP3_DPP8e_Fields {
let Inst{40-32} = fi;
- let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?);
let Inst{95-72} = dpp8{23-0};
}
@@ -1257,7 +1253,7 @@ class VOP3_DPP8_Enc_t16 <bits<10> op, VOPProfile P> :
VOP3_DPPe_Common_t16<op, P>,
VOP3_DPP8e_Fields_t16 {
let Inst{40-32} = fi;
- let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?);
let Inst{95-72} = dpp8{23-0};
}
@@ -1270,7 +1266,7 @@ class VOP3P_DPP8<bits<8> op, string OpName, VOPProfile P> :
let VOP3P = 1;
let Inst{40-32} = fi;
- let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?);
let Inst{95-72} = dpp8{23-0};
}
@@ -1357,8 +1353,12 @@ class VOPBinOpClampPat<SDPatternOperator node, Instruction inst, ValueType vt> :
class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
dag src0 = !if(P.HasOMod,
- (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
- (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp));
+ !if(P.HasClamp,
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i32:$omod)),
+ !if(P.HasClamp,
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers)));
list<dag> ret3 = [(set P.DstVT:$vdst,
(DivergentFragOrOp<node, P>.ret (P.Src0VT src0),
@@ -1873,6 +1873,12 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
}
}
+multiclass VOP3_Real_with_name_gfx11_gfx12_gfx13<
+ bits<10> op, string opName, string asmName, string pseudo_mnemonic = "", bit isSingle = 0> :
+ VOP3_Real_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>,
+ VOP3_Real_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>,
+ VOP3_Real_with_name<GFX13Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
+
// for READLANE/WRITELANE
multiclass VOP3_Real_No_Suffix<GFXGen Gen, bits<10> op, string opName = NAME> {
defvar ps = !cast<VOP_Pseudo>(opName);
@@ -2204,12 +2210,12 @@ include "VOP3PInstructions.td"
include "VOPDInstructions.td"
class ClassPat<Instruction inst, ValueType vt> : GCNPat <
- (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)),
+ (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))),
(inst i32:$src0_mods, vt:$src0, (V_MOV_B32_e32 timm:$mask))
>;
class ClassPat_t16<Instruction inst, ValueType vt> : GCNPat <
- (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)),
+ (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))),
(inst i32:$src0_mods, vt:$src0, SRCMODS.NONE, (V_MOV_B32_e32 timm:$mask))
>;
@@ -2274,3 +2280,12 @@ def VOPTrue16Table : GenericTable {
let PrimaryKey = ["Opcode"];
let PrimaryKeyName = "getTrue16OpcodeHelper";
}
+
+def DPMACCInstructionTable : GenericTable {
+ let FilterClass = "VOP_Pseudo";
+ let CppTypeName = "DPMACCInstructionInfo";
+ let Fields = ["Opcode", "IsDPMACCInstruction"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "getDPMACCInstructionHelper";
+}
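
The new DPMACCInstructionTable goes through the SearchableTables backend: every VOP_Pseudo is collected into a sorted array of CppTypeName structs plus a binary-search lookup named by PrimaryKeyName. In the real table Opcode is the instruction record itself, which the backend lowers to the target opcode enum; the sketch below uses a plain integer key to stay standalone (hypothetical names, run through llvm-tblgen -gen-searchable-tables):

include "llvm/TableGen/SearchableTable.td"

class ExampleRec<bits<16> opc, bit dpmacc> {
  bits<16> Opcode = opc;
  bit IsDPMACCInstruction = dpmacc;
}
def EX_CMP_F64 : ExampleRec<10, 1>;
def EX_CMP_F32 : ExampleRec<11, 0>;

def ExampleTable : GenericTable {
  let FilterClass = "ExampleRec";
  let CppTypeName = "ExampleInfo";
  let Fields = ["Opcode", "IsDPMACCInstruction"];
  let PrimaryKey = ["Opcode"];
  let PrimaryKeyName = "getExampleInfo";
}
// Emits, roughly: struct ExampleInfo { uint16_t Opcode; bool
// IsDPMACCInstruction; }; and const ExampleInfo *getExampleInfo(uint16_t).
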